1200字范文,内容丰富有趣,写作的好帮手!
1200字范文 > scrapy爬虫+echarts数据分析(安居客)

scrapy爬虫+echarts数据分析(安居客)

时间:2023-05-11 20:51:36

相关推荐

scrapy爬虫+echarts数据分析(安居客)

本次爬取的是安居客的房产信息。话不多说,直接上代码!

一:爬虫板块:

1.运行文件:run.py

from scrapy import cmdline

if __name__ == "__main__":
    # Same as running `scrapy crawl anjuke_shanghai` on the command line;
    # the argument list is spelled out instead of splitting a string.
    cmdline.execute(["scrapy", "crawl", "anjuke_shanghai"])

2.网页解析:anjuke_shanghai.py

import scrapy

import time

from anjuke.items import AnjukeItem

class AnjukeShanghaiSpider(scrapy.Spider):
    """Spider that scrapes the listing rows of paginated ``/sale/p<N>/`` pages.

    Each ``<li>`` under ``ul#houselist-mod-new`` becomes one ``AnjukeItem``.
    After a page has been processed the spider requests the next page number
    until ``max_page_id`` is reached.
    """

    name = 'anjuke_shanghai'

    # NOTE(review): the allowed domain is empty and the start URL has no
    # scheme/host in this copy of the code (they look stripped by the page the
    # code was published on). Restore the real values before running.
    allowed_domains = ['']
    start_urls = ['/sale/p11/#filtersort']

    # Page number that will be requested after the current response.
    next_page_id = 12
    # No further page is requested once next_page_id reaches this value.
    max_page_id = 50

    @staticmethod
    def _first_text(node, query):
        """Return the first match of *query* under *node*, or "" when absent."""
        return node.xpath(query).extract_first(default="")

    def parse(self, response):
        """Yield one item per listing row, then a request for the next page.

        Missing nodes produce empty strings instead of raising IndexError, so
        one incomplete listing no longer aborts the rest of the page.

        No ``time.sleep`` is used here: sleeping inside a callback blocks
        Scrapy's event loop. Request pacing should come from the
        DOWNLOAD_DELAY / AutoThrottle settings instead.
        """
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            item = AnjukeItem()

            title = self._first_text(ajk, ".//div[@class='house-title']/a/text()")
            item['title'] = title.strip()

            item['price'] = self._first_text(
                ajk, ".//span[@class='price-det']/strong/text()")

            unit_price = self._first_text(ajk, ".//span[@class='unit-price']/text()")
            item['unit_price'] = unit_price.replace("元/m²", "")

            # Collapse runs of whitespace inside the address to single spaces.
            site = self._first_text(ajk, ".//span[@class='comm-address']/text()")
            item['site'] = " ".join(site.split())

            item['house_type'] = self._first_text(
                ajk, ".//div[@class='details-item']/span[1]/text()")

            area = self._first_text(ajk, ".//div[@class='details-item']/span[2]/text()")
            item['area'] = area.replace("m²", "")

            item['house_url'] = self._first_text(
                ajk, ".//div[@class='house-title']/a/@href")

            yield item

        if self.next_page_id < self.max_page_id:
            # Resolve the path against the current response so the request
            # always carries a scheme and host.
            url = response.urljoin(
                "/sale/p{}/#filtersort".format(self.next_page_id))
            self.next_page_id += 1
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)


# 3. items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AnjukeItem(scrapy.Item):
    """Container for the values the spider extracts from one listing row."""

    # Title
    title = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Unit price
    unit_price = scrapy.Field()
    # Location
    site = scrapy.Field()
    # House type
    house_type = scrapy.Field()
    # Area
    area = scrapy.Field()
    # Link to the listing
    house_url = scrapy.Field()


# 4. middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface

from itemadapter import is_item, ItemAdapter

class AnjukeSpiderMiddleware:
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    Every hook below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Called for each response that goes through the spider middleware and
        into the spider. Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request or item
        objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Leave exception handling to other middleware by returning None.

        Called when a spider or process_spider_input() method (from other
        spider middleware) raises an exception. Should return either None or
        an iterable of Request or item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Called with the start requests of the spider; works like
        process_spider_output() except that there is no associated response.
        Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s', spider.name)

class AnjukeDownloaderMiddleware:
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    Every hook below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue through the downloader chain.

        Called for each request that goes through the downloader middleware.
        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged.

        Called with the response returned from the downloader. Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Leave exception handling to other middleware by returning None.

        Called when a download handler or a process_request() (from other
        downloader middleware) raises an exception. Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s', spider.name)


# 5. pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

from itemadapter import ItemAdapter

import pymysql

class AnjukePipeline:
    """Item pipeline that inserts each scraped item into a MySQL table.

    A connection to the ``anjuke`` database on localhost is opened when the
    pipeline is constructed and released in ``close_spider``.
    """

    def __init__(self):
        """Open the database connection and a cursor used for all inserts."""
        # NOTE(review): credentials are hard-coded; consider reading them
        # from the project settings or the environment.
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
        self.cursor = self.connect.cursor()
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Insert *item* into ``anjuke_shanghai`` and return it unchanged.

        The statement is parameterized, so scraped text is never spliced into
        the SQL string. If the insert or the commit fails, the transaction is
        rolled back and the exception is re-raised.
        """
        print("开始保存数据")
        insql = (
            "insert into anjuke_shanghai"
            "(title,price,unit_price,site,house_type,area,house_url) "
            "values (%s,%s,%s,%s,%s,%s,%s)"
        )
        values = (
            item['title'], item['price'], item['unit_price'], item['site'],
            item['house_type'], item['area'], item['house_url'],
        )
        try:
            self.cursor.execute(insql, values)
            self.connect.commit()
        except Exception:
            # Do not leave a half-finished transaction open on the connection.
            self.connect.rollback()
            raise
        print("保存数据成功")
        return item

    def close_spider(self, spider):
        """Release database resources when the spider finishes."""
        self.parse_close()

    def parse_close(self):
        """Close the cursor, then the connection it belongs to.

        Kept under its original name for existing callers; ``close_spider``
        is the hook Scrapy calls.
        """
        self.cursor.close()
        self.connect.close()


# 6. settings.py

# Scrapy settings for anjuke project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): the contact URL inside "(+)" is empty in this copy.
USER_AGENT = 'anjuke (+)'

# Obey robots.txt rules (turned off here)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are enabled by default; set this to False to disable them.
# NOTE(review): Scrapy's documentation says cookies set through a raw
# "Cookie" header are not considered by the cookies middleware. Confirm that
# the header in DEFAULT_REQUEST_HEADERS is actually sent while this is True.
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# NOTE(review): the Cookie value below is a captured login session. Treat it
# as a secret and replace it with your own instead of committing it.
DEFAULT_REQUEST_HEADERS = {
    # "*/*" restored: the published copy had lost it, leaving ",/;q=0.8".
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=160889550&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
# NOTE(review): per the Scrapy docs, 0 means cached responses never expire,
# so repeated runs may be served from HTTPCACHE_DIR instead of the live site.
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

二:数据板块:

1.数据库内容:

2.使用pyecharts分析截图:

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。