1200字范文 > [Python爬虫] 之二十五：Selenium +phantomjs 利用 pyquery抓取今日头条网数据

[Python爬虫] 之二十五：Selenium +phantomjs 利用 pyquery抓取今日头条网数据

时间：2022-06-22 06:18:55

一、介绍

本例使用 Selenium + PhantomJS 爬取今日头条（/search/?keyword=电视）的资讯信息，并根据给定的关键字筛选抓取到的资讯。

给定关键字：数字；融合；电视

抓取信息内容如下：

1、资讯标题

2、资讯链接

3、资讯时间

4、资讯来源

二、网站信息

三、数据抓取

针对上面的网站信息，来进行抓取

1、首先抓取信息列表

抓取代码：Elements = doc('div[class="articleCard"]')

2、抓取标题

抓取代码：title = element('a[class="link title"]').find('span').text().encode('utf8').replace(' ', '')

3、抓取链接

抓取代码：url = '' + element.find('a[class="link title"]').attr('href')

4、抓取日期

抓取代码：strdate = element('span[class="lbtn"]').text().encode('utf8').strip()

5、抓取来源

抓取代码：source = element('a[class="lbtn source J_source"]').text().encode('utf8').replace(' ', '')

四、完整代码

# coding=utf-8
# NOTE: this script targets Python 2 (it relies on ``urllib.quote`` and on
# byte-string results from ``.encode('utf8')``).
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
import time
from datetime import datetime
import IniFile
# from threading import Thread
from pyquery import PyQuery as pq
import LogFile
import mongoDB
import urllib


class toutiaoSpider(object):
    """Crawl search-result pages for configured keywords and store matches.

    Pages are rendered with a PhantomJS web driver, parsed with pyquery, and
    the matching entries are passed to ``mongoDB.mongoDbBase``.
    """

    def __init__(self):
        """Read the configuration, build the start URLs and start the driver."""
        # The log file and the config file both live in the parent of the
        # current working directory.
        logfile = os.path.join(os.path.dirname(os.getcwd()), time.strftime('%Y-%m-%d') + '.txt')
        self.log = LogFile.LogFile(logfile)
        configfile = os.path.join(os.path.dirname(os.getcwd()), 'setting.conf')
        cf = IniFile.ConfigFile(configfile)
        webSearchUrl = cf.GetValue("toutiao", "webSearchUrl")
        # Keywords are stored as one ';'-separated string.
        self.keyword_list = cf.GetValue("section", "information_keywords").split(';')
        self.db = mongoDB.mongoDbBase()
        # One search URL per keyword, in the same order as ``keyword_list``.
        self.start_urls = [webSearchUrl + urllib.quote(word) for word in self.keyword_list]
        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 2)
        self.driver.maximize_window()

    def scroll_foot(self):
        """Scroll the page towards the bottom.

        :return: the result of executing the scroll script in the driver.
        """
        js = ""
        # Chrome driver or PhantomJS.
        if self.driver.name == "chrome" or self.driver.name == 'phantomjs':
            js = "var q=document.body.scrollTop=10000"
        # Internet Explorer driver.
        elif self.driver.name == 'internet explorer':
            js = "var q=document.documentElement.scrollTop=10000"
        # For any other driver name an empty script is executed.
        return self.driver.execute_script(js)

    def date_isValid(self, strDateText):
        """Decide whether a relative date string refers to today.

        :param strDateText: date text taken from the page, e.g. 'N分钟前',
            '刚刚' or 'N小时前'; any other text is rejected.
        :return: ``(True, 'YYYY-MM-DD')`` with today's date when the text is
            accepted, otherwise ``(False, '')``.
        """
        currentDate = time.strftime('%Y-%m-%d')
        if strDateText.find('分钟前') > 0 or strDateText.find('刚刚') > -1:
            return True, currentDate
        elif strDateText.find('小时前') > 0:
            # Fixed: the original called an undefined name ``pile``.
            datePattern = re.compile(r'\d{1,2}')
            ch = int(time.strftime('%H'))  # current hour of the day
            strDate = datePattern.findall(strDateText)
            if len(strDate) == 1:
                # "N hours ago" is still today only if N does not exceed the
                # number of hours elapsed today.
                if int(strDate[0]) <= ch:
                    return True, currentDate
        return False, ''

    def log_print(self, msg):
        """Print a message to stdout prefixed with a timestamp.

        :param msg: the message to print.
        """
        # Parenthesised single-argument form prints identically on Python 2
        # and is also valid syntax on Python 3.
        print('%s: %s' % (time.strftime('%Y-%m-%d %H-%M-%S'), msg))

    def scrapy_date(self):
        """Fetch every start URL, extract today's matching entries, save them.

        An entry is kept when its date text is accepted by ``date_isValid``
        and its title contains the keyword the page was searched with. The
        driver is closed and quit when the crawl ends, even on error.
        """
        strsplit = '------------------------------------------------------------------------------------'
        try:
            # ``start_urls`` was built from ``keyword_list`` in order, so the
            # two lists pair up one-to-one.
            for link, keyword in zip(self.start_urls, self.keyword_list):
                self.driver.get(link)
                # Give the page time to load; without the delay the result
                # list may be missing.
                time.sleep(1)
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)
                infoList = []
                self.log.WriteLog(strsplit)
                self.log_print(strsplit)
                Elements = doc('div[class="articleCard"]')
                for element in Elements.items():
                    strdate = element('span[class="lbtn"]').text().encode('utf8').strip()
                    flag, date = self.date_isValid(strdate)
                    if not flag:
                        continue
                    title = element('a[class="link title"]').find('span').text().encode('utf8').replace(' ', '')
                    if title.find(keyword) == -1:
                        continue
                    # NOTE(review): the prefix is an empty string, so the
                    # stored URL is whatever the href contains; a site base
                    # URL may be missing here -- confirm.
                    url = '' + element.find('a[class="link title"]').attr('href')
                    source = element('a[class="lbtn source J_source"]').text().encode('utf8').replace(' ', '')
                    dictM = {'title': title, 'date': date,
                             'url': url, 'keyword': keyword, 'introduction': title, 'source': source}
                    infoList.append(dictM)
                    self.log_print('title:%s' % dictM['title'])
                    self.log_print('url:%s' % dictM['url'])
                    self.log_print('date:%s' % dictM['date'])
                    self.log_print('source:%s' % dictM['source'])
                    self.log_print('kword:%s' % dictM['keyword'])
                    self.log_print(strsplit)
                if infoList:
                    self.db.SaveInformations(infoList)
        finally:
            # Always release the browser process, even if a page failed.
            self.driver.close()
            self.driver.quit()


if __name__ == '__main__':
    obj = toutiaoSpider()
    obj.scrapy_date()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。