一. 概要
1.通过python爬虫循环爬取古诗词网站古诗名句
2.落地到本地数据库
二. 页面分析
首先通过 Firebug 进行页面定位:
其次源码定位:
最终生成lxml etree定位div标签源码:
# Parse one listing page: each quote lives in div.left > div.sons > div.cont,
# where the first <a> holds the quote text and the last <a> its source work.
response = etree.HTML(data)
for row in response.xpath('//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'):
    # Evaluate the anchor-text XPath once per row (the original ran it twice).
    texts = row.xpath('a/text()')
    content = texts[0]   # the quote itself
    origin = texts[-1]   # the work / author it comes from
    self.db.add_new_row('mingJuSpider', {
        'content': content,
        'origin': origin,
        'createTime': str(date.today()),
    })
三. 执行结果
四. 脚本源码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Date   : /12/21 12:35
@Author : kaiqing.huang
@File   : mingJuSpider.py
'''
from utils import MySpider, MongoBase
from datetime import date
from lxml import etree
import sys


class mingJuSpider():
    """Crawl the famous-quote (名句) listing pages of the gushiwen site and
    persist each (quote, origin) pair into MongoDB via the project's MongoBase
    wrapper."""

    def __init__(self):
        # Project-local helpers: MongoDB wrapper and HTTP fetcher (from utils).
        self.db = MongoBase()
        self.spider = MySpider()

    def download(self, first_page=1, last_page=116):
        """Fetch every listing page in [first_page, last_page] and feed the
        HTML to parse().

        The page range is parameterized; the defaults reproduce the original
        hard-coded range(1, 117) behavior.
        """
        for page_id in range(first_page, last_page + 1):
            url = '/mingju/Default.aspx?p={}&c=&t='.format(page_id)
            print(url)  # was a Python 2 `print url` statement; fixed for Python 3
            data = self.spider.get(url)
            if data:  # skip pages whose download failed instead of crashing
                self.parse(data)

    def parse(self, data):
        """Extract (content, origin) pairs from one listing page and store
        each as a row in the 'mingJuSpider' collection."""
        response = etree.HTML(data)
        for row in response.xpath('//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'):
            # Evaluate the anchor-text XPath once (the original ran it twice
            # per row) and guard against rows without any anchor text, which
            # previously raised IndexError.
            texts = row.xpath('a/text()')
            if not texts:
                continue
            content = texts[0]   # the quote itself
            origin = texts[-1]   # the work / author it comes from
            self.db.add_new_row('mingJuSpider', {
                'content': content,
                'origin': origin,
                'createTime': str(date.today()),
            })


if __name__ == '__main__':
    # NOTE(review): nothing here recurses deeply; this limit bump is likely
    # unnecessary — kept to preserve original behavior. TODO confirm and drop.
    sys.setrecursionlimit(100000)
    do = mingJuSpider()
    do.download()