requests
文档:http://cn.python-requests.org/zh_CN/latest/
安装:pip --timeout=100 install requests
[ python ] pip 配置国内镜像源(亲测有效)
百度搜索
一个简单的小例子基于requests
模块的get
# Crawl the Baidu search home page with a plain GET request and save the
# returned HTML to disk for inspection.
import requests

if __name__ == "__main__":
    # NOTE(review): the original URL literal was lost in extraction (it was "");
    # the surrounding text says the target is the Baidu home page — confirm.
    url = "https://www.baidu.com"
    response = requests.get(url)
    # Force UTF-8 so the Chinese page text decodes correctly.
    response.encoding = 'utf-8'
    print("状态码:" + str(response.status_code))
    page_text = response.text
    print("页面内容:" + page_text)
    # Context manager guarantees the file is closed even on error.
    with open('./baidu.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取数据结束!')
搜狗搜索
基于requests
模块的get
# Fetch the Sogou result page for a user-supplied keyword and write the
# HTML to "<keyword>.html".
import requests

if __name__ == '__main__':
    # Browser-like User-Agent so the server does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = '/web'
    kw = input('输入查询关键字:')
    # The keyword is sent as the "query" URL parameter.
    query_params = {'query': kw}
    resp = requests.get(url, params=query_params, headers=headers)
    html = resp.text
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as out:
        out.write(html)
    print('数据爬取结束!')
百度翻译
基于requests
模块的post
# Query the Baidu Translate suggestion endpoint via POST and persist the
# JSON response to "<keyword>.json".
import requests
import json

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    post_url = '/sug'
    word = input('输入查询关键字:')
    # The endpoint expects the keyword as form field "kw".
    data = {'kw': word}
    response = requests.post(post_url, data, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = word + '.json'
    # Fix: the original called open() without ever closing the handle;
    # a context manager closes it deterministically.
    with open(fileName, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps the Chinese text human-readable.
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('数据爬取结束!')
豆瓣喜剧电影排行榜
基于requests
模块ajax
的get
请求爬取链接:https://movie.douban.com/chart
# Crawl the Douban comedy-film ranking via its AJAX GET endpoint and save
# the JSON payload to disk.
import requests
import json

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # type=24 selects the comedy category; start/limit page the results.
    param = {
        "type": "24",
        "interval_id": "100:90",
        "action": "",
        "start": "0",
        "limit": "20",
    }
    url = '/j/chart/top_list'
    response = requests.get(url, param, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = '豆瓣电影排行榜.json'
    # Fix: the original leaked the file handle (open() without close());
    # the context manager closes it even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('数据爬取结束!')
企业信息爬取
爬取链接:http://125.35.6.84:81/xk/
# Crawl cosmetics production-licence records: harvest company IDs from the
# paged list endpoint, then POST each ID to the detail endpoint and save
# all detail records as one JSON file.
import requests
import json

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.108 Safari/537.36 '
    }
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    id_list = []      # company IDs harvested from the list pages
    detail_list = []  # per-company detail JSON payloads
    # First two pages at 15 records each -> up to 30 IDs.
    for page in range(1, 3):
        param = {
            "on": "true",
            # Fix: convert inline instead of rebinding the loop variable.
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        response = requests.post(url, param, headers=headers)
        json_ids = response.json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    # Fix: renamed the loop variable so it no longer shadows builtin id().
    for company_id in id_list:
        data = {'id': company_id}
        res = requests.post(post_url, data, headers=headers)
        detail_list.append(res.json())
    fileName = '企业信息.json'
    # Fix: the original never closed the file; the context manager does.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(detail_list, fp, ensure_ascii=False)
    print('数据爬取结束!')
来源:爬虫开发入门丨老男孩IT教育