Python网络爬虫实例:股票数据定向爬虫
一、功能描述
目标:获取上交所和深交所所有股票的名称和交易信息
输出:保存到文件中
技术路线:requests-bs4-re
二、候选数据网站选择
候选网站:
新浪股票:/stock/
百度股票:/stock/
东方财富网:/stocklist.html
同花顺:/#refCountId=db_509381c1_860
老虎社区:/
选取原则:股票信息存在于HTML页面中,非js代码生成。没有robots协议限制
选取方法:浏览器F12 ,源代码查看等
选取心态:不要纠结于某个网站,多找信息源尝试
网站确定:
获取股票列表:
东方财富网:/stocklist.html
获取个股信息:
老虎社区:/
三、程序的结构设计
1. 从东方财富网获取股票列表
2. 根据股票列表逐个到老虎社区获取个股信息
3. 将结果存储到文件
四、实例代码
"""Stock data crawler.

Fetches the list of 6-digit stock codes from a listing page, then scrapes
each stock's detail page and appends the extracted key/value info to a
text file.  Technique stack: requests + BeautifulSoup (bs4) + re.
"""
import re
import traceback

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return its decoded body, or "" on any request error.

    Uses the content-sniffed ``apparent_encoding`` so Chinese pages decode
    correctly even when the HTTP header declares the wrong charset.
    """
    try:
        # timeout so a dead server cannot hang the whole crawl
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/HTTP failures are
        # treated as "page unavailable"; programming errors still surface.
        return ""


def getStockList(lst, stockURL):
    """Append every 6-digit stock code found in an <a href> on *stockURL* to *lst*.

    Prints the number of codes collected so far (progress feedback).
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    code_pat = re.compile(r"\d{6}")  # hoisted: compile once, not per anchor
    for anchor in soup.find_all('a'):
        # .get avoids the KeyError the original swallowed for <a> without href
        href = anchor.attrs.get('href', '')
        match = code_pat.search(href)
        if match:
            lst.append(match.group())
    print(len(lst))


def getStockInfo(lst, stockURL, fpath):
    """Scrape the detail page of each code in *lst*; append one dict-repr line per stock to *fpath*.

    A page that fails to download or parse is logged (traceback) and skipped,
    so one bad page does not abort the whole crawl.
    """
    for stock in lst:
        url = stockURL + stock
        html = getHTMLText(url)
        if html == "":
            continue  # download failed; move on to the next stock
        try:
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockName = soup.find_all('h1', attrs={'class': 'name'})[0]
            infoDict['股票名称'] = stockName.text.split()[0]
            stockInfo = soup.find('div', attrs={'class': 'stock-detail'})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            # <dt>/<dd> pairs form the key/value table on the detail page
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort crawl: report the parse failure but keep going.
            traceback.print_exc()
            continue


def main():
    """Crawl the stock list, then each stock's details, writing to output_file."""
    stock_list_url = '/stock_list.html'
    stock_info_url = '/stock/a/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == '__main__':  # guard: do not start crawling on mere import
    main()
五、优化
"""Optimized stock data crawler.

Two optimizations over the first version:
1. The page encoding is passed in explicitly, skipping requests' slow
   ``apparent_encoding`` detection on every page.
2. A single-line progress percentage is printed (carriage-return rewrite)
   so the user can watch the crawl advance.
"""
import re
import traceback

import requests
from bs4 import BeautifulSoup


def getHTMLText(url, code="utf-8"):
    """Fetch *url*, decode with the known encoding *code*, return text or "" on failure.

    Supplying *code* avoids per-page charset sniffing — the speed
    optimization this version introduces.
    """
    try:
        # timeout so a dead server cannot hang the whole crawl
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/HTTP failures are
        # treated as "page unavailable"; programming errors still surface.
        return ""


def getStockList(lst, stockURL):
    """Append every 6-digit stock code found in an <a href> on *stockURL* to *lst*.

    The listing page is GB2312-encoded, so the encoding is passed explicitly.
    Prints the number of codes collected (progress feedback).
    """
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    code_pat = re.compile(r"\d{6}")  # hoisted: compile once, not per anchor
    for anchor in soup.find_all('a'):
        # .get avoids the KeyError the original swallowed for <a> without href
        href = anchor.attrs.get('href', '')
        match = code_pat.search(href)
        if match:
            lst.append(match.group())
    print(len(lst))


def getStockInfo(lst, stockURL, fpath):
    """Scrape each code's detail page, appending one dict-repr line per stock to *fpath*.

    Shows a single-line progress percentage.  A page that fails to download
    or parse is skipped (best-effort) but still counted toward progress.
    """
    count = 0
    total = len(lst)  # hoisted: loop-invariant
    for stock in lst:
        url = stockURL + stock
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockName = soup.find_all('h1', attrs={'class': 'name'})[0]
            infoDict['股票名称'] = stockName.text.split()[0]
            stockInfo = soup.find('div', attrs={'class': 'stock-detail'})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            # <dt>/<dd> pairs form the key/value table on the detail page
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            # \r rewrites the same console line — the "progress bar"
            print('\r当前进度:{:.2f}%'.format(count * 100 / total), end="")
        except Exception:
            # Original silently skipped failures; log them so they are
            # diagnosable, then advance the progress counter anyway.
            traceback.print_exc()
            count = count + 1
            print('\r当前进度:{:.2f}%'.format(count * 100 / total), end="")
            continue


def main():
    """Crawl the stock list, then each stock's details, writing to output_file."""
    stock_list_url = '/stock_list.html'
    stock_info_url = '/stock/a/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == '__main__':  # guard: do not start crawling on mere import
    main()
在cmd中运行此文件,可看到进度的动态显示!
这一篇使用requests库,下一篇使用scrapy框架爬取股票数据.
参考中国大学MOOC网课程:Python网络爬虫与信息提取-嵩天