一、爬取数据
使用scrapy,不多说,上码
1、spider
import scrapy

from lianjia.items import anjukeItem


class AnjukeSpider(scrapy.Spider):
    """Spider for second-hand housing listings.

    Yields one ``anjukeItem`` per listing card on each result page, then
    schedules result pages 2-49 back through :meth:`parse`.
    """

    name = 'anjuke'
    # NOTE(review): domains and URL prefixes look redacted -- restore the real
    # values before running this spider.
    allowed_domains = ['', '']
    start_urls = ['/sale/?pi=baidu-cpc-sz-cty1&kwid=210821712325&bd_vid=6500656477618314429']

    def parse(self, response):
        """Extract every listing on this result page and follow pagination."""
        for card in response.xpath('//div[@class="property"]'):
            # Build a fresh item per listing: the original created one item
            # before the loop and re-yielded it, so concurrently processed
            # items could clobber each other's fields.
            item = anjukeItem()
            item['home_title'] = card.xpath('.//h3/@title').extract_first()
            # extract_first(default='') keeps .strip() from raising
            # AttributeError when a field is absent from the card.
            item['home_size'] = card.xpath(
                './/p[@class="property-content-info-text"][1]/text()'
            ).extract_first(default='').strip()
            item['home_addres'] = card.xpath(
                'string(.//div[@class="property-content-info property-content-info-comm"])'
            ).extract_first()
            item['home_single_price'] = card.xpath(
                './/p[@class="property-price-average"]/text()').extract_first()
            item['home_commpy'] = card.xpath(
                './/span[@class="property-extra-text"][3]/text()').extract_first()
            item['home_orientations'] = card.xpath(
                './/p[@class="property-content-info-text"][2]/text()'
            ).extract_first(default='').strip()
            item['home_height'] = card.xpath(
                './/p[@class="property-content-info-text"][3]/text()'
            ).extract_first(default='').strip()
            item['home_year'] = card.xpath(
                './/p[@class="property-content-info-text"][4]/text()'
            ).extract_first(default='').strip()
            item['home_layout'] = card.xpath(
                'string(.//p[@class="property-content-info-text property-content-info-attribute"])'
            ).extract_first(default='').strip()
            yield item

        # Pagination: pages 2..49 (the URL prefix looks redacted as well).
        for page in range(2, 50):
            yield scrapy.Request(url=f'/sale/p{page}/?', callback=self.parse)
2、item
# Define here the models for your scraped items
#
# See documentation in:
# /en/latest/topics/items.html
import scrapy


class anjukeItem(scrapy.Item):
    """One second-hand housing listing scraped by AnjukeSpider."""

    home_title = scrapy.Field()          # listing headline
    home_size = scrapy.Field()           # floor-area text
    home_addres = scrapy.Field()         # estate name + address text
    home_single_price = scrapy.Field()   # unit-price text
    home_commpy = scrapy.Field()         # listing agency
    home_orientations = scrapy.Field()   # facing direction
    home_height = scrapy.Field()         # floor-level text
    home_year = scrapy.Field()           # build-year text
    home_layout = scrapy.Field()         # room-layout text
3、pipelines
class anjuketxtPipeline:
    """Append each scraped item to ``anjuke.txt`` as one ';'-separated line."""

    def open_spider(self, spider):
        # Opened once per crawl; closed again in close_spider.
        self.fp = open('./anjuke.txt', mode='w+', encoding='utf-8')
        # Use ';' in the header as well: the original wrote the header with
        # ', ' while data rows used ';', which broke the downstream
        # pd.read_table(..., sep=';') parse.
        self.fp.write('home_title;home_size;home_addres;home_single_price;'
                      'home_commpy;home_orientations;home_height;home_year;home_layout\n')

    def process_item(self, item, spider):
        # str() guards against None field values so the join cannot raise.
        # (The original wrapped this in a bare except that silently dropped
        # the item -- returning None -- on any error.)
        line = ';'.join(str(value) for value in item.values()) + '\n'
        self.fp.write(line)
        return item

    def close_spider(self, spider):
        # Flush and release the output file at the end of the crawl.
        self.fp.close()
二、使用jupyter分析
import reimport osimport jiebaimport wordcloudimport pandas as pdimport numpy as npfrom PIL import Imageimport seaborn as snsfrom docx import Documentfrom docx.shared import Inchesimport matplotlib.pyplot as pltfrom pandas import DataFrame,Series
# Notebook-wide plot styling: seaborn dark grid; SimHei so the Chinese axis
# labels and titles render; keep the minus sign displayable with a CJK font.
sns.set_style('darkgrid')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Load the crawl output; rows are ';'-separated (written by anjuketxtPipeline).
df_home = pd.read_table('./anjuke.txt', sep=';')

# Inspect rows with any missing field (Jupyter display; the redundant
# '== True' on the boolean mask is dropped).
df_home[df_home.isnull().any(axis=1)]

# Split the "estate district ..." address text: first token is the estate
# name, second the district -- TODO confirm against the crawled format.
df_home['loupan_name'] = df_home['home_addres'].str.split(' ', expand=True)[0]
df_home['home_addres'] = df_home['home_addres'].str.split(' ', expand=True)[1]
# Raw strings (not f-strings) for the regexes: the originals used f'...'
# although nothing was interpolated. Extract the leading digits and cast.
df_home['home_size'] = df_home['home_size'].map(
    lambda s: re.findall(r'(\d+).*', str(s))[0]).astype('float32')
df_home['home_single_price'] = df_home['home_single_price'].map(
    lambda s: re.findall(r'(\d+).*', str(s))[0]).astype('float32')
df_home['home_year'] = df_home['home_year'].map(
    lambda s: re.findall(r'(\d+).*', str(s))[0]).astype('int64')
# Total price = floor area * unit price.
df_home['sum_price'] = df_home['home_size'] * df_home['home_single_price']
# Kernel-density plot of the floor-area distribution across all listings.
plt.figure(figsize=(10, 8), dpi=100)
ax = df_home.home_size.plot(kind='kde')
ax.set_xlabel('总面积')
ax.set_ylabel('总数')
ax.set_title('房子总面积分布情况')
fig = ax.get_figure()
fig.savefig('房子总面积分布情况.png', dpi=400)
# Box plot: total-price distribution per build year.
plt.figure(figsize=(18, 10), dpi=100)
ax = sns.boxplot(x='home_year', y='sum_price', data=df_home)
ax.set_xlabel('年份')
ax.set_ylabel('总价')
ax.set_title('各年份价格区间分布')
ax.get_figure().savefig('各年份价格区间分布.png', dpi=400)
def _plot_top10_by_hue(hue, title):
    """Bar-plot the ten highest unit-price listings, colored by *hue*.

    x is the estate name, y the unit price; the figure is saved to
    '<title>.png' at 400 dpi, matching the original per-cell filenames.
    """
    top10 = df_home.sort_values(by='home_single_price', ascending=False).iloc[:10, :]
    plt.figure(figsize=(10, 8), dpi=100)
    ax = sns.barplot(x='loupan_name', y='home_single_price', data=top10, hue=hue)
    ax.set_xlabel('楼盘名称')
    ax.set_ylabel('单价')
    ax.set_title(title)
    ax.get_figure().savefig(f'{title}.png', dpi=400)


# The original repeated this cell seven times with only hue/title changed.
_plot_top10_by_hue('home_year', '单价排名前十的楼盘')
_plot_top10_by_hue('home_height', '单价排名前十的楼盘的层高')
_plot_top10_by_hue('home_size', '单价排名前十的楼盘大小')
_plot_top10_by_hue('home_layout', '单价排名前十的楼盘布局')
_plot_top10_by_hue('home_commpy', '单价排名前十的楼盘公司')
_plot_top10_by_hue('home_addres', '单价排名前十的楼盘地理位置')
_plot_top10_by_hue('home_orientations', '单价排名前十的楼盘朝向')
# Bar chart: number of listings per room-layout type.
plt.figure(figsize=(10, 8), dpi=100)
layout_counts = df_home.groupby(by='home_layout')['loupan_name'].count()
ax = layout_counts.plot(kind='bar')
ax.set_xlabel('布局')
ax.set_ylabel('总数')
ax.set_title('房子布局最多的类型')
ax.get_figure().savefig('房子布局最多的类型.png', dpi=400)
# Regression joint plot: floor area vs unit price.
grid = sns.jointplot(x="home_size", y="home_single_price", data=df_home,
                     kind="reg", truncate=False, color="m", height=14)
grid.fig.savefig('房子大小与价格的关系.png')
# Point plot: mean unit price per build year, over the full year range.
plt.figure(figsize=(18, 8), dpi=100)
ax = sns.pointplot(x='home_year', y='home_single_price', data=df_home, color='g')
ax.set_xlabel('年份')
ax.set_ylabel('单价')
ax.set_title('房子年份和价格的关系')
ax.get_figure().savefig('房子年份和价格的关系.png', dpi=400)
# NOTE(review): the year threshold was lost -- the original query string
# 'home_year>=' is an incomplete expression and would raise inside .query().
# 2000 is a placeholder; TODO confirm the intended cutoff year.
year_cutoff = 2000
# Point plot: mean unit price per build year, restricted to recent years.
plt.figure(figsize=(10, 8), dpi=100)
ax = sns.pointplot(x='home_year', y='home_single_price',
                   data=df_home.query('home_year >= @year_cutoff').copy(),
                   color='g')
ax.set_xlabel('年份')
ax.set_ylabel('单价')
ax.set_title('之后房子年份和价格的关系')
ax.get_figure().savefig('之后房子年份和价格的关系.png', dpi=400)
import imageio

# --- Word cloud of listing-title keywords -----------------------------------
titles = df_home['home_title'].to_list()
# Collapse every run of non-CJK / non-ASCII-letter characters to one space.
cleaned = [re.sub(r'[^a-zA-Z\u4E00-\u9FA5]+', ' ', str(t)) for t in titles]
# Stopword list, one word per line; a set gives O(1) membership tests
# (the original kept it as a list).
stop_words = set(pd.read_csv('./百度停用词表.txt', engine='python',
                             encoding='utf-8', names=['stopwords'])['stopwords'])

title_tokens = []
for text in cleaned:
    words = jieba.lcut(text)  # exact-mode segmentation, no redundant tokens
    ser = pd.Series(words)
    ser = ser[[len(w) > 1 for w in ser]]        # keep tokens longer than 1 char
    kept = ser[~ser.isin(stop_words)].unique()  # drop stopwords, dedupe per title
    if len(kept) > 0:
        title_tokens.append(list(kept))

# Flatten and join into the single space-separated string wordcloud expects.
feature_str = ' '.join(word for tokens in title_tokens for word in tokens)

font_path = r'./simhei.ttf'
mask_img = imageio.imread('./home.jpg')  # shape mask for the cloud
wc = wordcloud.WordCloud(background_color='black', mask=mask_img,
                         font_path=font_path, min_font_size=5,
                         max_font_size=50, width=260, height=260)
wc.generate(feature_str)
plt.figure(figsize=(10, 8), dpi=100)
plt.imshow(wc)
plt.axis('off')
# Explicit .png extension so the output name matches the other saved figures.
plt.savefig('标题提取关键词.png')
def convert(s):
    """Return the first highlight keyword found in listing title *s*.

    Priority order: 地铁 > 阳台 > 燃气 > 精装修. Returns None when no
    keyword is present (matching the original's implicit fall-through).
    """
    # Plain substring tests replace the original's four redundant
    # re.findall('.*(kw).*') calls: the captured group was always the
    # keyword itself, so a membership check is equivalent.
    for keyword in ('地铁', '阳台', '燃气', '精装修'):
        if keyword in str(s):
            return keyword
    return None


# Tag each listing with the highlight keyword extracted from its title.
df_home['home_bytrain'] = df_home['home_title'].map(convert)
# NOTE(review): the year threshold was lost here too -- 'home_year>=' is an
# incomplete expression and would raise inside .query(). 2000 is a
# placeholder; TODO confirm the intended cutoff year.
year_cutoff = 2000
# Point plot: unit price per build year, split by the title keyword factor.
plt.figure(figsize=(10, 8), dpi=100)
ax = sns.pointplot(x='home_year', y='home_single_price',
                   data=df_home.query('home_year >= @year_cutoff').copy(),
                   color='red', hue='home_bytrain')
ax.set_xlabel('年份')
ax.set_ylabel('单价')
ax.set_title('之后房子年份和价格的走向影响因素')
ax.get_figure().savefig('之后房子年份和价格的走向影响因素.png', dpi=400)