您可以使用像 python-goose 这样的工具,它旨在从 HTML 页面中提取文章。
另外,我写了下面这个小程序,效果不错:
from html5lib import parse
# Read the saved page into memory; the file is closed before parsing starts.
# NOTE(review): open() without an explicit encoding decodes with the platform
# default -- confirm that matches the encoding of page.html.
with open('page.html') as f:
    markup = f.read()

# Build an lxml tree via html5lib. The tag comparisons further down use plain
# names such as 'div', which presumably is why namespacing is switched off.
doc = parse(markup, treebuilder='lxml', namespaceHTMLElements=False)
html = doc.getroot()
# Extraction starts from the first <body> element found in the document.
body = html.xpath('//body')[0]
def sanitize(element):
    """Flatten all text found under *element* into a single line.

    The text fragments produced by ``element.itertext()`` are joined with
    single spaces, and every run of whitespace (newlines included) is
    collapsed to one space. Intended for block elements whose children are
    all inline elements.
    """
    # str.split() with no argument splits on any whitespace, newlines too,
    # so re-joining the resulting words normalises the spacing in one pass.
    words = ' '.join(element.itertext()).split()
    return ' '.join(words)
def parse(element):
    """Yield the text blocks found under *element*, one string per block.

    Container tags (``div``, ``li``, ``a``, ``body``, ``ul``) are descended
    into when they have no leading text of their own (``text`` is ``None`` or
    whitespace only); a container that does start with real text is treated
    as one block and flattened with ``sanitize``. Paragraph and heading tags
    are always flattened with ``sanitize``. Any other tag is reported on
    stdout and yields nothing.

    NOTE: this file also imports ``parse`` from html5lib; this definition
    rebinds that name, so the html5lib call has to run before this ``def``.
    """
    tag = element.tag
    # These elements can contain other blocks inside them.
    if tag in ('div', 'li', 'a', 'body', 'ul'):
        if element.text is None or element.text.isspace():
            # Iterate the element directly: getchildren() is deprecated in
            # lxml and no longer exists on xml.etree elements.
            for child in element:
                yield from parse(child)
        else:
            yield sanitize(element)
    # These elements are expected to contain inline content only.
    elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        yield sanitize(element)
    else:
        try:
            print('> ignored', tag)
        except Exception:
            # The diagnostic is best-effort: a failure to print must not stop
            # the extraction. Catching Exception (rather than a bare except)
            # still lets KeyboardInterrupt and SystemExit propagate.
            pass
# Print only the extracted text blocks that are longer than 80 characters.
for e in parse(body):
    if len(e) > 80:
        print(e)