import numpy as np
import pandas as pd
import requests as req
from bs4 import BeautifulSoup
# Fetch sports news pages so they can be stored to a file.
def getNewsHtml(url):
    """Download the raw HTML of one news page.

    Parameters
    ----------
    url : str
        Address of the news page to fetch.

    Returns
    -------
    str
        The page HTML on success, or the sentinel string "Error" on any
        request failure (callers depend on this sentinel, so it is kept).
    """
    try:
        # A timeout keeps one dead URL from hanging the whole crawl.
        r = req.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
        r.raise_for_status()
        return r.text
    except req.RequestException:
        # Catch only network/HTTP errors; the original bare `except`
        # would also hide programming bugs such as NameError.
        return "Error"
# Extract the news fields from a downloaded page.
def getNewDate(html):
    """Parse one news page into its title, body text and publish time.

    Parameters
    ----------
    html : str
        Page HTML as returned by ``getNewsHtml``.

    Returns
    -------
    list
        ``[title, body_text, publish_time]``.  Raises IndexError if the
        expected elements are missing (e.g. when *html* is "Error").
    """
    soup = BeautifulSoup(html, "html.parser")
    # The headline sits in <div class="LEFT"><h1>.
    title = soup.select("div.LEFT > h1")
    print(title[0].text)
    # Publish time is carried in a <meta name="apub:time"> tag.
    publish_time = soup.find_all("meta", attrs={"name": "apub:time"})[0].attrs["content"]
    print(publish_time)
    # Article body paragraphs.
    paragraphs = soup.select("div.content-article > p.one-p")
    # The first two paragraphs are intentionally skipped (original
    # behaviour — presumably lead-in boilerplate; confirm against the
    # page layout).  "".join avoids the quadratic += concatenation.
    text = "".join(p.text for p in paragraphs[2:])
    return [title[0].text, text, publish_time]
# Crawl every path in the urls array.
def forNewUrl(urls):
    """Fetch and parse every URL, returning one record per page.

    Parameters
    ----------
    urls : list[str]
        Page addresses to crawl.

    Returns
    -------
    list
        One ``[title, text, publish_time]`` record per URL, in order.
    """
    # Comprehension replaces the manual append loop (same behaviour).
    return [getNewDate(getNewsHtml(url)) for url in urls]
# Persist the crawled news records.
def saveNewDate(ListNewsDate, newPath):
    """Write the crawled news records to an Excel workbook.

    Parameters
    ----------
    ListNewsDate : list
        Rows of ``[title, text, publish_time]`` from ``forNewUrl``.
    newPath : str
        Destination ``.xlsx`` path.
    """
    # Column headers are kept byte-identical (including the "NewTilte"
    # misspelling) so existing consumers of the spreadsheet still match.
    df = pd.DataFrame(ListNewsDate,
                      columns=["NewTilte", "NewContent", "createtime"])
    # ExcelWriter.save() was deprecated in pandas 1.5 and removed in
    # 2.0; the context manager closes (and saves) the file correctly on
    # all supported versions.
    with pd.ExcelWriter(newPath) as writer:
        df.to_excel(writer, sheet_name="ListNewsDate1")
# Page paths to crawl.
# NOTE(review): these are site-relative paths — presumably resolved
# against the news site's domain elsewhere; confirm before reuse.
# url = "/rain/a/SPO121602087000"
urls = [
    "/rain/a/SPO121602087000",
    "/omn/1218/1218A0NMFX00.html",
    "/omn/1218/1218A0OTX800.html",
    "/omn/1218/1218A0JR4H00.html",
    "/omn/1218/1218A0OO9M00.html",
    "/omn/1218/1218A0JVAA00.html",
    "/omn/1218/1218A0HDXZ00.html",
    "/omn/1218/1218A0F26Y00.html",
    "/omn/1218/1218A0F1T500.html",
    "/omn/1218/1218A0ENJ800.html",
    "/omn/1218/1218A0E85400.html",
    "/rain/a/1218A0CEBN00",
    "/omn/1218/1218A0CAJB00.html",
    "/omn/1218/1218A0BPK400.html",
    "/omn/1218/1218A0BNTG00.html",
    "/rain/a/1218A0BNI300",
    "/omn/1218/1218A0BM8G00.html",
    "/omn/1218/1218A0BFS000.html",
    "/omn/1218/1218A0B3AT00.html",
    "/rain/a/1218A0B0CI00",
    "/omn/1218/1218A0AUGQ00.html",
    "/omn/1218/1218A0A42300.html",
    "/omn/1218/1218A09YES00.html",
    "/omn/1218/1218A09XPJ00.html",
    "/omn/1218/1218A09MW500.html",
    "/omn/1218/1218A09AGO00.html",
    "/omn/1218/1218A08E6V00.html",
    "/omn/1218/1218A067ZI00.html",
    "/omn/1218/1218A046ZD00.html",
    "/omn/1218/1218A0424P00.html",
]
def run():
    """Crawl all configured news pages and save them to an Excel file."""
    ListNewsDate = forNewUrl(urls)
    saveNewDate(ListNewsDate, "ListNewsDate.xlsx")


# Guard the entry point so importing this module does not trigger a
# network crawl; running the file directly behaves exactly as before.
if __name__ == "__main__":
    run()