# 原创:仅用于学习Python爬虫,请勿商业或恶意爬取数据
# 文件夹和文件都是程序创建,我只爬了这些数据用于测试
# 仅用了两个for循环,并没有搞得太难(函数),适合新手操练,有大量注释易于理解
"""Scrape every book listed on the site's /book/ index page and save each
chapter as a UTF-8 text file under ./史书典籍/<book name>/<chapter>.txt.

For learning purposes only — do not use for commercial or abusive scraping.
"""
import os
from multiprocessing import Pool  # NOTE(review): unused here — verify before removing

import requests
from lxml import etree

# Browser-like headers so the site serves normal pages.
# NOTE(review): the Referer looks truncated — the scheme/host appears to have
# been stripped during extraction; restore it before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    'Referer': '/',
}

# NOTE(review): base URL also looks truncated (e.g. "http://www.shicimingju.com"
# seems to be missing); restore the host before running.
url = '/book/'

# Fetch the book-index page and parse it into an lxml tree.
response = requests.get(url=url, headers=headers)
etrees = etree.HTML(response.text)

# Each <h2> under the bookmark list holds one book's link and title.
h2_list = etrees.xpath('//div[@class="bookmark-list"]/ul/li/h2')

for i in h2_list:
    # Link to this book's table of contents, e.g. /book/shuotang.html,
    # and the book's title, e.g. 说唐.
    src_list = i.xpath("./a/@href")[0]
    src_url = url + src_list
    a_name = i.xpath("./a/text()")[0]

    # Fetch the table-of-contents page and collect one <li> per chapter.
    response = requests.get(url=src_url, headers=headers)
    etrees = etree.HTML(response.text)
    li_list = etrees.xpath('//div[@class="book-mulu"]/ul/li')

    # BUGFIX: os.mkdir() raises FileNotFoundError when the parent folder
    # ./史书典籍 does not exist yet. makedirs(..., exist_ok=True) creates the
    # whole path and replaces the race-prone os.path.exists() pre-check.
    file_path = "./史书典籍/%s" % a_name + "/"
    os.makedirs(file_path, exist_ok=True)

    for j in li_list:
        # One chapter: its title and its page URL.
        a_text_singles = j.xpath("./a/text()")[0]
        a_url_singles = j.xpath("./a/@href")[0]
        a_url = url + a_url_singles

        # Fetch the chapter page and join its body paragraphs into one string.
        response = requests.get(url=a_url, headers=headers)
        etrees = etree.HTML(response.text)
        content = etrees.xpath('//div[@class="chapter_content"]/p/text()')
        content = "".join(content)

        # Save the chapter as <book dir>/<chapter title>.txt in UTF-8.
        file_name = file_path + a_text_singles + ".txt"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(content)
        print("%s,%s下载完成" % (a_name, a_text_singles))
    print("%s下载完成" % a_name)