
Python crawler (16): crawling Dingdian (顶点小说网) with the Scrapy framework


This post uses the Scrapy framework to crawl all of the novels on the Dingdian novel site (顶点小说网).

1. Installing Scrapy

There are plenty of installation tutorials online, so I won't repeat them here; a typical install is sketched below.
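For completeness only, a minimal sketch of a typical installation (assuming pip is already available; this step is not covered in the original post):

pip install scrapy
# confirm the install worked
scrapy version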

2. About Scrapy

The Scrapy framework is an excellent tool: it crawls asynchronously, which saves a lot of time. This crawl could also have been written in the same plain style as my earlier posts,

and that would work, but it felt far too slow given the amount of data involved.

For an introduction to the framework itself, there are also plenty of examples online.

3. The implementation

Create the project with:

scrapy startproject dingdian

Then add the spider file; the final project layout looks like this:

├── dingdian
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mydingdian.py
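As an aside (not part of the original workflow), the spider skeleton can also be generated with Scrapy's own command instead of creating mydingdian.py by hand; the domain argument here is only a placeholder:

cd dingdian
scrapy genspider mydingdian example.com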

The main program (note that the code throughout this post is Python 2):

mydingdian.py

#coding:utf-8
import scrapy
import re
from scrapy.http import Request
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem


class Myspider(scrapy.Spider):
    name = "dingdian"
    allowed_domains = [""]   # NOTE: the domain and the site URLs were stripped from the original post
    bash_url = "/class/"
    bashurl = '.html'

    def start_requests(self):
        #for i in range(1,11):
        for i in range(7, 8):
            url = self.bash_url + str(i) + "_1" + self.bashurl
            yield Request(url, self.parse)

    def parse(self, response):
        baseurl = response.url  # the url here looks like /class/*_1.html
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # largest page number for this category
        print max_num
        baseurl = baseurl[:-7]
        #for num in xrange(1, int(max_num)+1):
        for num in xrange(1, 3):
            newurl = baseurl + "_" + str(num) + self.bashurl
            # With dont_filter the first page gets crawled, without it it does not:
            # Scrapy deduplicates request URLs (RFPDupeFilter), and dont_filter tells it
            # to exclude this URL from deduplication.
            yield Request(newurl, dont_filter=True, callback=self.get_name)  # hand the listing page to get_name

    def get_name(self, response):
        for nameinfo in response.xpath('//tr'):
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel url
            name = nameinfo.xpath('td[1]/a/text()').extract_first()     # novel name
            if novelurl:
                yield Request(novelurl, dont_filter=True, callback=self.get_novelcontent, meta={'name': name})
        '''
        # grab the novel details from the listing page itself
        #print nameinfo
        name = nameinfo.xpath('td[1]/a/text()').extract_first()           # novel name
        author = nameinfo.xpath('td[3]/text()').extract_first()           # author
        novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()        # novel url
        serialstatus = nameinfo.xpath('td[6]/text()').extract_first()     # serial status
        serialnumber = nameinfo.xpath('td[4]/text()').extract_first()     # word count
        if novelurl:
            targentcontent['novel_name'] = name
            targentcontent['author'] = author
            targentcontent['novelurl'] = novelurl
            targentcontent['serialstatus'] = serialstatus
            targentcontent['serialnumber'] = serialnumber
            #print name, author, novelurl, serialstatus, serialnumber
            yield Request(novelurl, callback=self.get_novelcontent, meta={'targentcontent': targentcontent})
        The novel details don't need to be passed along here for now.
        '''

    def get_novelcontent(self, response):
        #targentcontent = response.meta['targentcontent']
        #print targentcontent['novelurl'], targentcontent['name']
        #title = response.xpath('//dd[1]/h1/text()').extract_first()
        novel_name = response.meta['name']                                              # novel name
        author = response.xpath('//tr[1]/td[2]/text()').extract_first()                 # author
        novelurl = response.url                                                         # novel url
        serialstatus = response.xpath('//tr[1]/td[3]/text()').extract_first()           # serial status
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()           # word count
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()             # category
        name_id = novelurl[-5:]                                                         # novel id
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()      # total bookmarks
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()        # total clicks
        #chapterlistul = response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
        chapterlisturl = response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()  # chapter list url
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()                    # synopsis

        targentcontent = DingdianItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['name_id'] = name_id
        targentcontent['collect_num_total'] = collect_num_total
        targentcontent['click_num_total'] = click_num_total
        targentcontent['novel_breif'] = novel_breif
        #yield targentcontent
        #print novel_name, author, novelurl, serialstatus, serialnumber, category, name_id, collect_num_total, click_num_total, chapterlisturl
        yield Request(chapterlisturl, dont_filter=True, callback=self.get_charaterurl, meta={'targentcontent': targentcontent})

    def get_charaterurl(self, response):
        #print response.url
        item = response.meta['targentcontent']
        for contents in response.xpath('//table/tr'):
            for content in contents.xpath('td'):
                if content.xpath('a/text()').extract_first():
                    #print content.xpath('a/text()').extract_first()
                    item['chapterurl'] = response.url + content.xpath('a/@href').extract_first()
                    item['chaptername'] = content.xpath('a/text()').extract_first()
                    yield item
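The XPath expressions above are tied tightly to the site's table layout, so it is worth checking them interactively before starting a long crawl. A minimal sketch using Scrapy's shell (the URL is only a placeholder for one of the category listing pages):

scrapy shell "http://www.example.com/class/7_1.html"
# then, inside the shell:
>>> response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()
>>> response.xpath('//tr/td[1]/a/@href').extract()[:5]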

The item definitions, items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# /en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    novel_name = scrapy.Field()         # novel name
    author = scrapy.Field()             # author
    novelurl = scrapy.Field()           # novel url
    serialstatus = scrapy.Field()       # serial status
    serialnumber = scrapy.Field()       # word count
    category = scrapy.Field()           # category
    name_id = scrapy.Field()            # novel id
    collect_num_total = scrapy.Field()  # total bookmarks
    click_num_total = scrapy.Field()    # total clicks
    novel_breif = scrapy.Field()        # novel synopsis
    chapterurl = scrapy.Field()         # chapter url
    chaptername = scrapy.Field()        # chapter title
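A scrapy.Item behaves much like a dict, which is why the spider and pipeline read and write its fields with subscript syntax. A quick illustration (the field values here are made up):

from dingdian.items import DingdianItem

item = DingdianItem()
item['novel_name'] = u'Example novel'   # made-up value
item['author'] = u'Example author'      # made-up value
# subscript access raises KeyError for fields that were never set; get() returns None instead
print item['novel_name'], item.get('category')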

The settings, settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# /en/latest/topics/settings.html
# /en/latest/topics/downloader-middleware.html
# /en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'

PAGE_STORGE = "novels"   # custom setting: root directory where the pipeline saves novels

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See /en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See /en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See /en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See /en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See /en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See /en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See /en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
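The pipeline below reads the custom PAGE_STORGE value by importing the settings module directly. A more conventional Scrapy pattern, shown here only as a sketch and not what the original code does, is to pull it from the crawler settings via from_crawler:

class DingdianPipeline(object):
    def __init__(self, page_storge):
        self.page_storge = page_storge

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        return cls(crawler.settings.get('PAGE_STORGE', 'novels'))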

Finally, the item processing and storage:

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: /en/latest/topics/item-pipeline.html

from dingdian import settings
import os
import urllib2
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem
from bs4 import BeautifulSoup as bs

import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class DingdianPipeline(object):
    def process_item(self, item, spider):
        dir_path = "%s/%s" % (settings.PAGE_STORGE, spider.name)
        if not os.path.exists(dir_path):
            #print "dir_path is %s", dir_path
            os.makedirs(dir_path)
        if isinstance(item, DingdianItem):
            novelpath = dir_path + '/' + item['novel_name']
            print novelpath
            if not os.path.exists(novelpath):
                os.makedirs(novelpath)

            # write the novel metadata / synopsis file once
            novelbreif = item['novel_name'] + "_简介"
            novelbreifpath = novelpath + '/' + novelbreif + '.txt'
            if not os.path.exists(novelbreifpath):
                with open(novelbreifpath, 'wb') as novel_write:
                    novel_write.write(item['novel_name'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['author'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['novelurl'])
                    novel_write.write('\n')
                    novel_write.write(item['serialstatus'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['serialnumber'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['category'])
                    novel_write.write('\n')
                    novel_write.write(item['name_id'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['collect_num_total'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['click_num_total'])
                    novel_write.write('\n')
                    novel_write.write(item['novel_breif'])

            # download the chapter page and save its text content
            titlename = item['chaptername']
            titlenamepath = novelpath + '/' + titlename + '.txt'
            print titlenamepath
            chapterurl = item['chapterurl']
            html = urllib2.urlopen(chapterurl).read()
            soup1 = bs(html, 'lxml')
            if not os.path.exists(titlenamepath):
                with open(titlenamepath, 'wb') as file_write:
                    cont = soup1.find("dd", attrs={"id": "contents"}).getText()
                    #print cont
                    file_write.write(cont)
        return item

# The -o books.csv option writes the scraped items to a CSV file.
# Besides CSV, Scrapy also supports JSON and XML output.

Then run:

scrapy crawl dingdian

If there are no errors, wait a few hours and you will find plenty of novels sitting on your own machine.
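As the comment at the end of pipelines.py notes, Scrapy can also export the yielded items directly; for example (the output file names are arbitrary):

scrapy crawl dingdian -o books.csv    # CSV export
scrapy crawl dingdian -o books.json   # JSON and XML work as well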
