
Scraping IT之家 Comments

Posted: 2022-07-23 04:47:18


Notice: if you repost this article, please state where it was reposted in the comments below.

You are also welcome to visit my blog (link).

Major version preview:

Add incremental comment updates, so the crawler never again misses a single article or a single comment.
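
That feature is not written yet; here is a minimal sketch of one way it could work, assuming a hypothetical state file last_id.txt that persists the highest article ID already crawled (the 208073 default mirrors the start of the 2.0 main loop below):

import os

STATE_FILE = 'last_id.txt'  # hypothetical state file, not part of the current script

def load_last_id(default=208073):
    # Resume from the last crawled article ID, or start from the default.
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, encoding='utf-8') as f:
            return int(f.read().strip())
    return default

def save_last_id(news_id):
    # Record progress after each article so an interrupted run loses nothing.
    with open(STATE_FILE, 'w', encoding='utf-8') as f:
        f.write(str(news_id))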

Update, May 18, 21:10:55:

New features:

Batch crawling over a specified range of article IDs now works as planned.

Planned for the next version:

Crawl through proxy IPs to lower the risk of being flagged, and add request headers.
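
Neither is implemented yet. A minimal sketch of what both could look like with requests; the proxy address and header values are placeholders, and data_page stands in for the full POST payload the script builds:

import requests

url = '/ithome/getajaxdata.aspx'  # endpoint as it appears in the script (host elided there as well)
data_page = {'type': 'commentpage'}  # stand-in for the script's real payload

# Placeholder proxy and browser-like headers; substitute your own proxy pool.
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.ithome.com/',
}

page = requests.post(url, data=data_page, headers=headers, proxies=proxies, timeout=10).text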

Known bugs:

A comment thread may have no floor 1, which can break the crawl (the paging loop uses floor 1 to detect the last page).
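
One possible workaround, not yet in the script: end the crawl when a page comes back empty instead of keying on floor 1. fetch_entries and handle are hypothetical stand-ins for the POST-and-parse and per-entry logic already inside getpage_commentinfo:

def crawl_all_pages(fetch_entries, handle):
    # Stop only when a page has no 'li.entry' elements, so a thread
    # without a floor 1 still terminates cleanly.
    for page_no in range(1, 6666):
        entries = fetch_entries(page_no)
        if not entries:
            break
        for entry in entries:
            handle(entry)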

Version 2.0

import requests
from bs4 import BeautifulSoup
import leancloud
# import logging
# logging.basicConfig(level=logging.DEBUG)

# Enter your LeanCloud credentials; if you do not need cloud upload, disable the upload call below.
leancloud.init("2C2xis80wMsTyMrkyi1cQIxG-gzGzoHsz", "sYMaOVyBxA81KTXMXQYwDDIg")

url = '/ithome/getajaxdata.aspx'  # request address for IT之家 comment data
news_id = str(0)
urlpage = '/comment/' + news_id


def get_news_hash(news_id):
    """Get the article's hash value.

    The hash is not computed by JS; it sits in the last few lines of the
    comment page's source, set as a hidden input element.
    """
    html_home = requests.get(url=urlpage)
    # Parse the page source and pick out the hash.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash


def getpage_commentinfo(page_url):
    """Take an article URL and return the list of user comment dicts."""
    all_comment = []
    for i in range(1, 6666):  # iterate over comment pages
        data_page = {  # POST payload
            'newsID': news_id,
            'hash': get_news_hash(news_id),
            'type': 'commentpage',
            'page': str(i),
            'order': 'false'
        }
        # Fetch one page; each 'li.entry' element holds one user's comment.
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page, 'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'})
        if len(user_allmember) == 0:  # empty page: no more comments
            break
        # Read each user entry into a dict and append it to the list.
        for user_infor in user_allmember:
            # Fields: id, level, name, comment, upvotes, downvotes, device,
            # floor, location, time, app version, news id
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div', attrs={'class': 'comm'}).p.get_text()
            except:
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                user_allinfo['user_dev'] = user_infor.find('a', attrs={'href': '///ithome/download/'}).string
            except:
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div', attrs={'class': 'nmp'}).find('span', attrs={'class': 'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1] + ' ' + temp[2]
            except:
                user_allinfo['user_address'] = 'None'
                user_allinfo['user_time'] = 'None'  # both keys must exist, or writing the file raises KeyError
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except:
                user_allinfo['user_app'] = 'None'
            user_allinfo['user_news_id'] = news_id
            all_comment.append(user_allinfo)
            print('已抓取' + str(user_allinfo['user_floor']) + '楼')
        if int(user_allinfo['user_floor']) == 1:  # floor 1 means the last page (known bug: some threads lack a floor 1)
            break
    return all_comment


def run_write_page_comment(page_commentinfo):
    """Take the user list and append the records to a text file."""
    page_comment = page_commentinfo
    with open('评论信息.txt', 'a', encoding='utf-8') as f:
        for i in range(0, len(page_comment)):
            try:
                f.write(page_comment[i]['user_id'] + '☆' + page_comment[i]['user_level'] + '☆'
                        + page_comment[i]['user_name'] + '☆' + page_comment[i]['user_comment'] + '☆'
                        + page_comment[i]['user_comment_praise'] + '☆' + page_comment[i]['user_comment_oppose'] + '☆'
                        + page_comment[i]['user_dev'] + '☆' + page_comment[i]['user_floor'] + '☆'
                        + str(page_comment[i]['user_address']) + '☆' + str(page_comment[i]['user_time']) + '☆'
                        + str(page_comment[i]['user_app']) + '☆' + str(page_comment[i]['user_news_id']) + '\n')
            except:
                print('抓取错误的楼层' + page_comment[i]['user_floor'] + '\n')
            if i == len(page_comment) - 1:
                print('抓取成功')


def run_update(page_commentinfo):
    """Take the user list and upload each record to LeanCloud."""
    # The free tier allows only 30,000 API requests per day, and every
    # comment costs one request, so watch the quota.
    print('正在上传评论到云端')
    for member in page_commentinfo:
        Comment = leancloud.Object.extend('comment')
        comment = Comment()
        comment.set('user_id', int(member['user_id']))
        comment.set('user_level', int(member['user_level']))
        comment.set('user_name', member['user_name'])
        comment.set('user_comment', member['user_comment'])
        comment.set('user_comment_praise', int(member['user_comment_praise']))
        comment.set('user_comment_oppose', int(member['user_comment_oppose']))
        comment.set('user_dev', member['user_dev'])
        comment.set('user_floor', int(member['user_floor']))
        comment.set('user_address', member['user_address'])
        comment.set('user_time', member['user_time'])
        comment.set('user_app', member['user_app'])
        comment.set('user_news_id', int(member['user_news_id']))
        comment.save()
    print('成功上传' + str(len(page_commentinfo)) + '条评论到云端')


# Write the header once, then crawl the configured range of article IDs.
with open('评论信息.txt', 'a', encoding='utf-8') as f:
    f.write('id☆等级☆姓名☆评论☆赞同数☆反对数☆设备信息☆楼层☆地址☆时间☆app版本☆新闻id' + '\n')

for i in range(208073, 310000):
    try:
        news_id = str(i)
        print('正在爬新闻ID为' + news_id)
        urlpage = '/comment/' + news_id
        # run_update(getpage_commentinfo(urlpage))
        run_write_page_comment(getpage_commentinfo(urlpage))
    except:
        pass  # skip articles that error out
print('任务完成')
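
Since the daily API quota is the bottleneck, batching the uploads may help. This sketch assumes the installed LeanCloud Python SDK exposes leancloud.Object.save_all (check your SDK version before relying on it); the field assignments mirror run_update above:

def run_update_batched(page_commentinfo):
    # One batched request instead of one save() per comment.
    # Assumption: leancloud.Object.save_all exists in the installed SDK.
    Comment = leancloud.Object.extend('comment')
    batch = []
    for member in page_commentinfo:
        c = Comment()
        c.set('user_id', int(member['user_id']))
        c.set('user_comment', member['user_comment'])
        # ... set the remaining fields exactly as run_update does ...
        batch.append(c)
    leancloud.Object.save_all(batch)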

Update, May 18, 18:23:45:

New features:

Added uploading to a cloud database (LeanCloud).

Known bugs:

The bug where device info could not be retrieved has been fixed. Importing the output into xlsx still corrupts some user records; the culprit is the '#' delimiter, since comments themselves may contain '#'.

The bugs above are fixed in 2.0, which switches the delimiter to '☆'; a sturdier csv-based alternative is sketched below.
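
Switching the delimiter only makes a collision less likely. Assuming a csv file is acceptable for the spreadsheet import ('comments.csv' is a hypothetical output name), the standard csv module quotes any field containing the delimiter, and utf-8-sig adds the BOM Excel expects:

import csv

# A '#' or ',' inside a comment is quoted instead of corrupting the row.
row = ['123456', '5', 'some_user', 'comment with # and , inside', '10', '2']
with open('comments.csv', 'a', encoding='utf-8-sig', newline='') as f:
    csv.writer(f, quoting=csv.QUOTE_MINIMAL).writerow(row)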

Version 1.0

import requests
from bs4 import BeautifulSoup
import leancloud
# import logging
# logging.basicConfig(level=logging.DEBUG)

leancloud.init("api id", "api key")

news_id = input("请输入网站id:")
urlpage = '/comment/' + news_id
url = '/ithome/getajaxdata.aspx'  # request address for IT之家 comment data


def get_news_hash(news_id):
    """Get the article's hash value.

    The hash is not computed by JS; it sits in the last few lines of the
    comment page's source, set as a hidden input element.
    """
    html_home = requests.get(url=urlpage)
    # Parse the page source and pick out the hash.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash


# The first comment page does not include the hot comments.
data_hot = {
    'newsID': news_id,
    'pid': '0',
    'type': 'hotcomment'
}


def getpage_commentinfo(page_url):
    """Take an article URL and return the list of user comment dicts."""
    all_comment = []
    for i in range(1, 6666):  # iterate over comment pages
        data_page = {  # POST payload
            'newsID': news_id,
            'hash': get_news_hash(news_id),
            'type': 'commentpage',
            'page': str(i),
            'order': 'false'
        }
        # Fetch one page; each 'li.entry' element holds one user's comment.
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page, 'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'})
        # Read each user entry into a dict and append it to the list.
        for x in range(0, len(user_allmember)):
            user_infor = user_allmember[x]  # take one user entry
            # Fields: id, level, name, comment, upvotes, downvotes, device,
            # floor, location, time, app version
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div', attrs={'class': 'comm'}).p.get_text()
            except:
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                user_allinfo['user_dev'] = user_infor.find('a', attrs={'href': '///ithome/download/'}).string
            except:
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div', attrs={'class': 'nmp'}).find('span', attrs={'class': 'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1] + ' ' + temp[2]
            except:
                user_allinfo['user_address'] = 'None'
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except:
                user_allinfo['user_app'] = 'None'
            all_comment.append(user_allinfo)
            print('已抓取' + str(user_allinfo['user_floor']) + '楼')
            if x == len(user_allmember) - 1:  # reached the last entry on this page
                break
        if int(user_allinfo['user_floor']) == 1:  # floor 1 means the last page
            break
    return all_comment


def run_write_page_comment(page_commentinfo):
    """Take the user list and append the records to a text file."""
    page_comment = page_commentinfo
    with open('评论信息.txt', 'a', encoding='utf-8') as f:
        f.write('id#等级#姓名#评论#赞同数#反对数#设备信息#楼层#地址#时间#app版本' + '\n')
        for i in range(0, len(page_comment)):
            try:
                f.write(page_comment[i]['user_id'] + '#' + page_comment[i]['user_level'] + '#'
                        + page_comment[i]['user_name'] + '#' + page_comment[i]['user_comment'] + '#'
                        + page_comment[i]['user_comment_praise'] + '#' + page_comment[i]['user_comment_oppose'] + '#'
                        + page_comment[i]['user_dev'] + '#' + page_comment[i]['user_floor'] + '#'
                        + str(page_comment[i]['user_address']) + '#' + str(page_comment[i]['user_time']) + '#'
                        + str(page_comment[i]['user_app']) + '\n')
            except:
                print('抓取错误的楼层' + page_comment[i]['user_floor'] + '\n')
            if i == len(page_comment) - 1:
                print('抓取成功')


def run_update(page_commentinfo):
    """Take the user list and upload each record to LeanCloud."""
    for member in page_commentinfo:
        Comment = leancloud.Object.extend('comment')
        comment = Comment()
        print(member['user_dev'])
        comment.set('user_id', int(member['user_id']))
        comment.set('user_level', int(member['user_level']))
        comment.set('user_name', member['user_name'])
        comment.set('user_comment', member['user_comment'])
        comment.set('user_comment_praise', int(member['user_comment_praise']))
        comment.set('user_comment_oppose', int(member['user_comment_oppose']))
        comment.set('user_dev', member['user_dev'])
        comment.set('user_floor', int(member['user_floor']))
        comment.set('user_address', member['user_address'])
        comment.set('user_time', member['user_time'])
        comment.set('user_app', member['user_app'])
        # The free tier allows only 30,000 API requests per day; watch the quota.
        comment.save()


run_update(getpage_commentinfo(urlpage))
# run_write_page_comment(getpage_commentinfo(urlpage))

Known bugs:

Device info cannot be retrieved for anything other than Android.
An article may have no comments at all, which makes the program throw an exception.
Retrieved user info can end up misaligned.

The bugs above are fixed in 2.0.

import requests
from bs4 import BeautifulSoup

news_id = input("请输入网站id:")
urlpage = '/comment/' + news_id
url = '/ithome/getajaxdata.aspx'  # request address for IT之家 comment data


def get_news_hash(news_id):
    """Get the article's hash value.

    The hash is not computed by JS; it sits in the last few lines of the
    comment page's source, set as a hidden input element.
    """
    html_home = requests.get(url=urlpage)
    # Parse the page source and pick out the hash.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash


# The first comment page does not include the hot comments.
data_hot = {
    'newsID': news_id,
    'pid': '0',
    'type': 'hotcomment'
}


def getpage_commentinfo(page_url):
    all_comment = []
    for i in range(1, 6666):  # iterate over comment pages
        data_page = {  # POST payload
            'newsID': news_id,
            'hash': get_news_hash(news_id),
            'type': 'commentpage',
            'page': str(i),
            'order': 'false'
        }
        # Fetch one page; each 'li.entry' element holds one user's comment.
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page, 'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'})
        # Read each user entry into a dict and append it to the list.
        for x in range(0, len(user_allmember)):
            user_infor = user_allmember[x]  # take one user entry
            # Fields: id, level, name, comment, upvotes, downvotes, device,
            # floor, location, time, app version
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div', attrs={'class': 'comm'}).p.get_text()
            except:
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                user_allinfo['user_dev'] = user_infor.find('span', attrs={'class': 'mobile android'}).string
            except:
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div', attrs={'class': 'nmp'}).find('span', attrs={'class': 'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1] + ' ' + temp[2]
            except:
                # Note: wrong key ('user_address'/'user_time' stay missing);
                # this is the source of the misalignment bug listed above.
                user_allinfo['user_address_and_time'] = 'None'
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except:
                user_allinfo['user_app'] = 'None'
            all_comment.append(user_allinfo)
            print(str(user_allinfo['user_floor']))
            if x == len(user_allmember) - 1:  # reached the last entry on this page
                break
        if int(user_allinfo['user_floor']) == 1:  # floor 1 means the last page
            break
    return all_comment


def run_write_page_comment(page_commentinfo):
    page_comment = page_commentinfo
    with open('评论信息.txt', 'w', encoding='utf-8') as f:
        f.write('id#等级#姓名#评论#赞同数#反对数#设备信息#楼层#地址#时间#app版本' + '\n')
        for i in range(0, len(page_comment)):
            try:
                f.write(page_comment[i]['user_id'] + '#' + page_comment[i]['user_level'] + '#'
                        + page_comment[i]['user_name'] + '#' + page_comment[i]['user_comment'] + '#'
                        + page_comment[i]['user_comment_praise'] + '#' + page_comment[i]['user_comment_oppose'] + '#'
                        + page_comment[i]['user_dev'] + '#' + page_comment[i]['user_floor'] + '#'
                        + str(page_comment[i]['user_address']) + '#' + str(page_comment[i]['user_time']) + '#'
                        + str(page_comment[i]['user_app']) + '\n')
            except:
                print('抓取错误的楼层' + page_comment[i]['user_floor'] + '\n')
            if i == len(page_comment) - 1:
                print('抓取成功')


run_write_page_comment(getpage_commentinfo(urlpage))
