1200字范文 > scrapy爬虫+echarts数据分析（安居客）

scrapy爬虫+echarts数据分析（安居客）

时间：2023-05-11 20:51:36

本次爬取是安居客的房产信息：话不多说，直接上代码！

一：爬虫板块：

1.运行文件：run.py

from scrapy import cmdline

# Launch the spider the same way `scrapy crawl anjuke_shanghai` would on the
# command line; the string is split into the argv list cmdline.execute expects.
cmdline.execute('scrapy crawl anjuke_shanghai'.split())

2.网页解析：anjuke_shanghai.py

import scrapy

import time

from anjuke.items import AnjukeItem

class AnjukeShanghaiSpider(scrapy.Spider):
    """Scrape listing rows from Anjuke sale pages and follow the pagination.

    The crawl starts at listing page 11 (see ``start_urls``) and every parsed
    page schedules the following one until ``next_page_id`` reaches 50, so the
    last page requested is page 49.

    Throttling is left to Scrapy's downloader (``DOWNLOAD_DELAY``) instead of
    ``time.sleep`` calls inside the callback: sleeping in a callback blocks
    the whole event loop and does not space out requests any better.
    """

    name = 'anjuke_shanghai'

    # NOTE(review): the host part of these values appears to have been
    # stripped when the article was extracted (empty domain, scheme-less URL).
    # Restore the real domain / absolute URL before running; Scrapy rejects a
    # start URL without a scheme.
    allowed_domains = ['']
    start_urls = ['/sale/p11/#filtersort']

    # Number of the next listing page to request.
    next_page_id = 12

    # Replaces the former time.sleep(5) before each next-page request.
    custom_settings = {'DOWNLOAD_DELAY': 5}

    def parse(self, response):
        """Yield one ``AnjukeItem`` per listing row, then the next-page request.

        Missing nodes produce empty-string fields instead of raising
        ``IndexError``, so one malformed row no longer aborts the page.
        """
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            item = AnjukeItem()

            title = ajk.xpath(".//div[@class='house-title']/a/text()").get(default="")
            item['title'] = title.strip()

            item['price'] = ajk.xpath(
                ".//span[@class='price-det']/strong/text()").get(default="")

            unit_price = ajk.xpath(".//span[@class='unit-price']/text()").get(default="")
            item['unit_price'] = unit_price.replace("元/m²", "")

            # Collapse every run of whitespace in the address to one space.
            site = ajk.xpath(".//span[@class='comm-address']/text()").get(default="")
            item['site'] = " ".join(site.split())

            item['house_type'] = ajk.xpath(
                ".//div[@class='details-item']/span[1]/text()").get(default="")

            area = ajk.xpath(".//div[@class='details-item']/span[2]/text()").get(default="")
            item['area'] = area.replace("m²", "")

            item['house_url'] = ajk.xpath(
                ".//div[@class='house-title']/a/@href").get(default="")

            yield item

        if self.next_page_id < 50:
            url = "/sale/p{}/#filtersort".format(self.next_page_id)
            # Advance the counter before yielding so it is updated even if the
            # generator is never resumed after the request is handed over.
            self.next_page_id += 1
            # response.follow resolves the relative path against the current
            # page URL; scrapy.Request would reject a URL without a scheme.
            yield response.follow(url, dont_filter=True, callback=self.parse)


# 3. items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AnjukeItem(scrapy.Item):
    """One property listing as collected by the spider and stored by the pipeline."""

    # Listing title
    title = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Price per square metre
    unit_price = scrapy.Field()
    # Location / address
    site = scrapy.Field()
    # Layout type
    house_type = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Link to the listing's detail page
    house_url = scrapy.Field()


# 4. middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface

from itemadapter import is_item, ItemAdapter

class AnjukeSpiderMiddleware:
    """Pass-through spider middleware: every hook forwards its input unchanged.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response entering the spider; returns None to continue."""
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield everything the spider produced for ``response``, unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider or ``process_spider_input`` raises.

        Implicitly returns None, i.e. the exception is not handled here.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Re-yield the spider's start requests unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        # Lazy %-style arguments: the message is only formatted if emitted.
        spider.logger.info('Spider opened: %s', spider.name)

class AnjukeDownloaderMiddleware:
    """Pass-through downloader middleware: requests and responses are not altered.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Called for each request going through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either return a Response object, return a Request object, or
        raise IgnoreRequest. This implementation returns ``response`` as-is.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or ``process_request`` raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops the process_exception() chain
        - return a Request object: stops the process_exception() chain

        This implementation implicitly returns None.
        """
        pass

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        # Lazy %-style arguments: the message is only formatted if emitted.
        spider.logger.info('Spider opened: %s', spider.name)


# 5. pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

from itemadapter import ItemAdapter

import pymysql

class AnjukePipeline:
    """Store every scraped item as one row of the MySQL table ``anjuke_shanghai``."""

    def __init__(self):
        """Open the MySQL connection and the cursor shared by all inserts."""
        # NOTE(review): credentials are hard-coded; consider reading them from
        # the Scrapy settings or the environment instead.
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
        self.cursor = self.connect.cursor()
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Insert ``item`` into ``anjuke_shanghai`` and commit.

        Values are passed as query parameters, never interpolated into the
        SQL text. If the insert or commit fails, the transaction is rolled
        back and the exception is re-raised.

        Returns the item unchanged so later pipelines can process it.
        """
        print("开始保存数据")
        insql = "insert into anjuke_shanghai(title,price,unit_price,site,house_type,area,house_url) values (%s,%s,%s,%s,%s,%s,%s)"
        try:
            self.cursor.execute(insql, (item['title'], item['price'], item['unit_price'], item['site'],
                                        item['house_type'], item['area'], item['house_url']))
            self.connect.commit()
        except Exception:
            # Leave the connection in a clean state, then let the error surface.
            self.connect.rollback()
            raise
        print("保存数据成功")
        return item

    def close_spider(self, spider):
        """Release the cursor, then the connection, when the spider finishes.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the previous ``parse_close`` name was never called by the framework.
        """
        self.cursor.close()
        self.connect.close()

    def parse_close(self):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(None)


# 6. settings.py

# Scrapy settings for anjuke project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#
# NOTE(review): this file was recovered from a web article whose extraction
# stripped comment markers. Every assignment below is kept active exactly as
# it appeared; some of them may originally have been commented-out template
# defaults - confirm against the real project before relying on them.

BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): the contact URL inside "(+...)" appears to have been stripped.
USER_AGENT = 'anjuke (+)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # "*/*" restored: the asterisks had been eaten by the article's markup.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # NOTE(review): this is a captured browser session (auth ticket included);
    # it will expire and should not live in version control.
    # NOTE(review): with COOKIES_ENABLED = True Scrapy's cookie middleware may
    # manage the Cookie header itself and drop this value - confirm.
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=160889550&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# NOTE(review): an expiration of 0 presumably means cached pages never
# expire, so repeated runs would re-read old listings - confirm this is wanted.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

二：数据版块：

1.数据库内容：

2.使用pyecharts分析截图：

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。

scrapy爬虫+echarts数据分析（安居客）

Define here the models for your scraped items

See documentation in:

/en/latest/topics/items.html

Define here the models for your spider middleware

See documentation in:

/en/latest/topics/spider-middleware.html

useful for handling different item types with a single interface

Define your item pipelines here

Don’t forget to add your pipeline to the ITEM_PIPELINES setting

See: /en/latest/topics/item-pipeline.html

useful for handling different item types with a single interface

Scrapy settings for anjuke project

For simplicity, this file contains only settings considered important or

commonly used. You can find more settings consulting the documentation:

/en/latest/topics/settings.html

/en/latest/topics/downloader-middleware.html

/en/latest/topics/spider-middleware.html

Crawl responsibly by identifying yourself (and your website) on the user-agent

USER_AGENT = ‘anjuke (+)’

Obey robots.txt rules

Configure maximum concurrent requests performed by Scrapy (default: 16)

CONCURRENT_REQUESTS = 32

Configure a delay for requests for the same website (default: 0)

See /en/latest/topics/settings.html#download-delay

See also autothrottle settings and docs

DOWNLOAD_DELAY = 3

The download delay setting will honor only one of:

CONCURRENT_REQUESTS_PER_DOMAIN = 16

CONCURRENT_REQUESTS_PER_IP = 16

Disable cookies (enabled by default)

Disable Telnet Console (enabled by default)

TELNETCONSOLE_ENABLED = False

Override the default request headers:

Enable or disable spider middlewares

See /en/latest/topics/spider-middleware.html

SPIDER_MIDDLEWARES = {

‘anjuke.middlewares.AnjukeSpiderMiddleware’: 543,

}

Enable or disable downloader middlewares

See /en/latest/topics/downloader-middleware.html

DOWNLOADER_MIDDLEWARES = {

‘anjuke.middlewares.AnjukeDownloaderMiddleware’: 543,

}

Enable or disable extensions

See /en/latest/topics/extensions.html

EXTENSIONS = {

‘scrapy.extensions.telnet.TelnetConsole’: None,

}

Configure item pipelines

See /en/latest/topics/item-pipeline.html

Enable and configure the AutoThrottle extension (disabled by default)

See /en/latest/topics/autothrottle.html

AUTOTHROTTLE_ENABLED = True

The initial download delay

The maximum download delay to be set in case of high latencies

The average number of requests Scrapy should be sending in parallel to

each remote server

Enable showing throttling stats for every response received:

AUTOTHROTTLE_DEBUG = False

Enable and configure HTTP caching (disabled by default)

See /en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

HTTPCACHE_ENABLED = True

HTTPCACHE_EXPIRATION_SECS = 0

HTTPCACHE_DIR = ‘httpcache’

HTTPCACHE_IGNORE_HTTP_CODES = []

HTTPCACHE_STORAGE = ‘scrapy.extensions.httpcache.FilesystemCacheStorage’