
Using a Python crawler to turn Douban reviews of Dunkirk into a word cloud


This is a film I have been wanting to see recently, so I went to read what people were saying about it. Since I happen to be learning Python web crawling, I turned it into a small example.

The code is adapted from third-party code; original link: /88325/#comment-94754

# coding: utf-8
__author__ = 'hang'

import warnings
warnings.filterwarnings("ignore")

import re
import jieba                       # Chinese word segmentation
import numpy                       # numerical helpers (numpy.size below)
import pandas as pd
import urllib2
from bs4 import BeautifulSoup as bs
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
import matplotlib.pyplot as plt
from wordcloud import WordCloud    # word cloud rendering

# Parse the "now playing" page and return a list of {'id', 'name'} dicts
def getNowPlayingMovie_list():
    resp = urllib2.urlopen('/nowplaying/hangzhou/')
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
            nowplaying_list.append(nowplaying_dict)
    return nowplaying_list

# Fetch one page (20 items) of short comments for a given movie id
def getCommentsById(movieId, pageNum):
    eachCommentStr = ''
    if pageNum > 0:
        start = (pageNum - 1) * 20
    else:
        return False
    requrl = '/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    resp = urllib2.urlopen(requrl)
    html_data = resp.read()
    soup = bs(html_data, 'html.parser')
    comment_div_lits = soup.find_all('div', class_='comment')
    for item in comment_div_lits:
        if item.find_all('p')[0].string is not None:
            eachCommentStr += item.find_all('p')[0].string
    return eachCommentStr.strip()

def main():
    # Collect the first 10 comment pages of the first movie in the list
    commentStr = ''
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentStr += commentList_temp.strip()

    # Strip whitespace plus English and Chinese punctuation
    cleaned_comments = re.sub(u"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】《》<>,“”!,...。?、~@#¥%……&*()]+", "", commentStr)
    print(cleaned_comments)

    # Chinese word segmentation with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stop words (quoting=3 means csv.QUOTE_NONE)
    stopwords = pd.read_csv(r"D:\pycode\stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Count word frequencies ("计数" = count)
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # Render a word cloud from the 1000 most frequent words
    wordcloud = WordCloud(font_path=r"D:\pycode\simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

# Entry point
main()
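The script above targets Python 2 (urllib2, print statements) and the pandas/wordcloud releases of that era; newer pandas, for example, no longer accepts .agg() with a dict of new column names. Purely as a minimal sketch of how the same pipeline might look on Python 3, assuming the requests, jieba, beautifulsoup4, wordcloud and matplotlib packages are installed, and treating the comments URL pattern, the movie id, the stop-word file and the font path as placeholders to adapt to your own environment:

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same pipeline. Assumptions: requests, jieba,
# beautifulsoup4, wordcloud and matplotlib are installed; the URL pattern,
# font path and stop-word file below are placeholders, not verified values.
import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

def fetch_comments(movie_id, pages=10):
    """Concatenate the text of the first `pages` pages of short comments."""
    text = ''
    for page in range(pages):
        # Placeholder URL pattern for the Douban comments endpoint.
        url = ('https://movie.douban.com/subject/%s/comments?start=%d&limit=20'
               % (movie_id, page * 20))
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for div in soup.find_all('div', class_='comment'):
            p = div.find('p')
            if p is not None and p.get_text():
                text += p.get_text()
    return text

def show_wordcloud(text, stopword_file='stopwords.txt', font='simhei.ttf'):
    # Keep only CJK characters (a simpler filter than the regex above).
    cleaned = re.sub(r'[^\u4e00-\u9fa5]+', '', text)
    with open(stopword_file, encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    words = [w for w in jieba.lcut(cleaned) if w and w not in stopwords]
    freq = Counter(words)  # word -> count, replaces the pandas groupby step
    wc = WordCloud(font_path=font, background_color='white', max_font_size=80)
    wc.generate_from_frequencies(freq)  # current equivalent of fit_words
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    # 'MOVIE_ID' is a placeholder: use the data-subject id from the now-playing page.
    show_wordcloud(fetch_comments('MOVIE_ID'))

In this sketch collections.Counter stands in for the pandas groupby word count, and generate_from_frequencies plays the role of the older fit_words call; everything else follows the structure of the original script.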
