1200字范文,内容丰富有趣,写作的好帮手!
1200字范文 > java爬虫案例——SpringBoot使用HttpClient Jsoup爬取京东手机数据

java爬虫案例——SpringBoot使用HttpClient Jsoup爬取京东手机数据

时间:2020-05-13 10:31:03

相关推荐

java爬虫案例——SpringBoot使用HttpClient Jsoup爬取京东手机数据

文章目录

前言
一、准备工作
二、项目文件
1.项目依赖
2.项目配置文件
3.pojo
4.dao接口
5.service接口及其实现类
6.HttpClient封装工具类
7.爬取任务实现
8.启动类
三、项目执行效果
总结

前言

之前同事分享了一些关于Java爬虫的视频,其中有一个是用HttpClient及Jsoup爬取京东上的一些手机数据(如图片、标题、sku、spu等),同时参考几篇博客后基本实现目标,在此篇做个简单记录。

一、准备工作

由于需要将爬取到的数据存储到数据库表中,因此需要先建库建表。建库建表SQL如下:

-- Recreate the crawler database from scratch. WARNING: this drops any existing data.
DROP DATABASE IF EXISTS `crawler`;
CREATE DATABASE IF NOT EXISTS `crawler` DEFAULT CHARSET = `utf8`;
USE `crawler`;

-- Foreign key checks are switched off while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;

-- Product table filled by the crawler.
DROP TABLE IF EXISTS `jd_item`;
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  -- The entity stores the price as a Double; an integer column would drop the fractional part.
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  -- Index used by the "does this sku already exist" lookup.
  KEY `sku` (`sku`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET = utf8 COMMENT = '京东商品表';

-- Restore the default so later statements in the same session are checked again.
SET FOREIGN_KEY_CHECKS = 1;

项目目录

二、项目文件

1.项目依赖

pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Versions of dependencies declared without <version> are managed by this parent. -->
    <parent>
        <artifactId>spring-boot-starter-parent</artifactId>
        <groupId>org.springframework.boot</groupId>
        <version>2.3.4.RELEASE</version>
    </parent>

    <groupId>cn.mlnt</groupId>
    <artifactId>mlnt-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Spring MVC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.21</version>
        </dependency>
        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- Jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Utility library -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>

2.项目配置文件

application.properties(或使用.yml):

# DB Configuration:
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=123456

# JPA Configuration:
spring.jpa.database=MySQL
spring.jpa.show-sql=true

3.pojo

Item.java:

package cn.mlnt.jd.pojo;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity mapped to the {@code jd_item} table.
 *
 * <p>Plain mutable bean: every column is exposed through a getter/setter pair.
 */
@Entity
@Table(name="jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit (the product group this item belongs to). */
    private Long spu;

    /** Stock keeping unit (the smallest product variant). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time. */
    private Date created;

    /** Last update time. */
    private Date updated;

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getSpu() {
        return this.spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return this.sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return this.price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}

4.dao接口

ItemDao.java:

package cn.mlnt.jd.dao;

import cn.mlnt.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link Item} entities keyed by their {@code Long} id.
 *
 * <p>Declares no methods of its own; everything comes from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}

5.service接口及其实现类

ItemService.java:

package cn.mlnt.jd.service;

import cn.mlnt.jd.pojo.Item;

import java.util.List;

/**
 * Service operations for storing and looking up {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Finds products matching the given criteria.
     *
     * @param item probe object carrying the criteria to match on
     * @return the matching products
     */
    List<Item> findAll(Item item);
}

ItemServiceImpl.java:

package cn.mlnt.jd.service.impl;

import cn.mlnt.jd.dao.ItemDao;
import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Saves the product through the repository inside a transaction.
     *
     * @param item the product to save
     */
    @Override
    @Transactional
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Runs a query-by-example lookup using the given item as the probe.
     *
     * @param item probe object carrying the criteria to match on
     * @return the products returned by the repository for that probe
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}

6.HttpClient封装工具类

HttpUtils.java:

package cn.mlnt.jd.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * Thin wrapper around Apache HttpClient for downloading pages and images.
 *
 * <p>All requests go through one pooled connection manager. Failures are reported by
 * returning an empty string instead of throwing, so callers must check for {@code ""}.
 */
@Component
public class HttpUtils {

    /** Browser User-Agent sent with page requests so the site treats them as browser traffic. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36";

    /** Prefix prepended to the generated file name when an image is stored. */
    private static final String IMAGE_DIR = "E:\\images\\";

    private final PoolingHttpClientConnectionManager cm;

    /** Built once and reused; it is deliberately never closed because it owns the shared pool. */
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool.
        this.cm.setMaxTotal(100);
        // Maximum number of connections per host.
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address of the page to fetch
     * @return the response body decoded as UTF-8, or {@code ""} when the request fails,
     *         the status is not 200, or the response has no body
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        httpGet.addHeader("User-Agent", USER_AGENT);

        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads the image at the given address and stores it under a random UUID file name
     * that keeps the extension found in the URL.
     *
     * @param url address of the image to fetch
     * @return the generated file name, or {@code ""} when the download fails
     */
    public String doGetImage(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File target = new File(IMAGE_DIR + picName);
                File parent = target.getParentFile();
                if (parent != null) {
                    // Best effort: if this fails, opening the stream below reports the error.
                    parent.mkdirs();
                }
                // try-with-resources so the file handle is released even when the copy fails.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns the extension (including the leading dot) of the last path segment of the URL,
     * or {@code ""} when that segment contains no dot.
     */
    private static String extensionOf(String url) {
        int lastSlash = url.lastIndexOf('/');
        int lastDot = url.lastIndexOf('.');
        // A dot before the last slash belongs to the host or a directory, not the file name.
        return lastDot > lastSlash ? url.substring(lastDot) : "";
    }

    /**
     * Builds the per-request timeout settings.
     *
     * @return the request configuration applied to every request
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish a connection.
                .setConnectTimeout(1000)
                // Maximum time to wait for a connection from the pool.
                .setConnectionRequestTimeout(500)
                // Maximum time for data transfer.
                .setSocketTimeout(10000)
                .build();
    }

    /**
     * Manual smoke test: fetches one product page and prints the title element and its text.
     * NOTE(review): the host was missing in the original and is assumed to be item.jd.com - confirm.
     */
    public static void main(String[] args) throws IOException {
        HttpUtils httpUtils = new HttpUtils();
        String itemInfo = httpUtils.doGetHtml("https://item.jd.com/100009082466.html");
        Document document = Jsoup.parse(itemInfo);
        System.out.println(document.select("div#itemName"));
        System.out.println(document.select("div#itemName").text());
    }
}

7.爬取任务实现

ItemTask.java:

package cn.mlnt.jd.task;

import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import cn.mlnt.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.OptionalDouble;

/**
 * Scheduled crawl job: walks the phone search result pages, and for every SKU that is not
 * stored yet downloads its picture, price and title and saves it as an {@link Item}.
 *
 * <p>NOTE(review): the three endpoint hosts below were missing in the original (only the
 * paths were present) and have been restored on the assumption that they are the JD
 * search, item and price hosts - confirm before running.
 */
@Component
public class ItemTask {

    /** Search result address; the page number is appended to it. */
    private static final String SEARCH_URL =
            "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=1e449f956a3b49319117b81bbde91f3c&page=";

    /** Product detail address prefix; followed by {@code <sku>.html}. */
    private static final String ITEM_URL_PREFIX = "https://item.jd.com/";

    /** Price lookup address prefix; followed by the sku. */
    private static final String PRICE_URL_PREFIX = "https://p.3.cn/prices/mgets?skuIds=J_";

    @Resource
    private HttpUtils httpUtils;

    @Resource
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls the search result pages. The next run starts 100 seconds after the previous
     * one has finished.
     *
     * @throws Exception if a fetched price response cannot be parsed
     */
    @Scheduled(fixedDelay = 100*1000)
    public void itemTask() throws Exception {
        // Only odd page numbers 1, 3, 5, 7, 9 are requested.
        for (int i = 1; i < 10; i=i+2) {
            String html = httpUtils.doGetHtml(SEARCH_URL + i);
            // doGetHtml signals failure with an empty string, so a null check alone never skips.
            if (html != null && !html.isEmpty()) {
                this.parse(html);
            }
        }
        System.out.println("手机数据抓取完成!");
    }

    /**
     * Parses one search result page and stores every SKU found on it.
     *
     * @param html the page source
     * @throws IOException if a fetched price response cannot be parsed
     */
    private void parse(String html) throws IOException {
        Document document = Jsoup.parse(html);
        Elements spuEles = document.select("div#J_goodsList > ul > li");
        for (Element spuEle : spuEles) {
            // A missing data-spu attribute is stored as spu 0.
            String spuAttr = spuEle.attr("data-spu");
            long spu = Long.parseLong(spuAttr.isEmpty() ? "0" : spuAttr);

            for (Element skuEle : spuEle.select("li.ps-item")) {
                this.saveSku(spu, skuEle);
            }
        }
    }

    /**
     * Stores a single SKU unless it is already stored or its data is incomplete.
     * Incomplete entries (no sku, no picture address, no price) are skipped on their own,
     * so the remaining SKUs of the same SPU are still processed.
     */
    private void saveSku(long spu, Element skuEle) throws IOException {
        String skuAttr = skuEle.select("[data-sku]").attr("data-sku");
        if (skuAttr.isEmpty()) {
            return;
        }
        long sku = Long.parseLong(skuAttr);

        // Look the sku up first; products that are already stored are not saved again.
        Item item = new Item();
        item.setSku(sku);
        List<Item> existing = this.itemService.findAll(item);
        if (!existing.isEmpty()) {
            return;
        }

        item.setSpu(spu);
        item.setUrl(ITEM_URL_PREFIX + sku + ".html");

        // The picture address may be absent.
        Element img = skuEle.select("img[data-sku]").first();
        String lazyImg = img == null ? "" : img.attr("data-lazy-img");
        if (lazyImg.isEmpty()) {
            return;
        }

        // Fetch the price before downloading the picture so a failed price lookup does not
        // leave an image file behind for an item that is never saved.
        OptionalDouble price = this.fetchPrice(sku);
        if (!price.isPresent()) {
            return;
        }
        item.setPrice(price.getAsDouble());

        String picUrl = ("https:" + lazyImg).replace("/n9/", "/n1/");
        item.setPic(this.httpUtils.doGetImage(picUrl));

        String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
        item.setTitle(Jsoup.parse(itemInfo).select("div#itemName").text());

        item.setCreated(new Date());
        item.setUpdated(item.getCreated());

        this.itemService.save(item);
    }

    /**
     * Fetches the price of one sku.
     *
     * @return the value of the {@code p} field of the first array element, or empty when
     *         the request failed or the response does not have that shape
     * @throws IOException if the response body is not valid JSON
     */
    private OptionalDouble fetchPrice(long sku) throws IOException {
        String priceJson = this.httpUtils.doGetHtml(PRICE_URL_PREFIX + sku);
        if (priceJson.isEmpty()) {
            return OptionalDouble.empty();
        }
        JsonNode root = MAPPER.readTree(priceJson);
        JsonNode first = root == null ? null : root.get(0);
        JsonNode priceNode = first == null ? null : first.get("p");
        return priceNode == null ? OptionalDouble.empty() : OptionalDouble.of(priceNode.asDouble());
    }
}

8.启动类

Application.java:

package cn.mlnt.jd;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point.
 *
 * <p>Scheduled tasks only run once scheduling has been switched on, which is what the
 * {@code @EnableScheduling} annotation is for.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}


三、项目执行效果

爬取到的图片

存储到数据库中的记录


总结

参照视频敲完后,执行项目并没有爬到数据,因为视频中没有提及要添加header,声明为浏览器访问。后来参考网上的博客后,遇到的问题基本解决。

//设置请求Request Headers中的User-Agent,告诉京东说这是浏览器访问httpGet.addHeader("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36");


参考文章地址:

https://blog.csdn.net/hellowork10/article/details/106292150
https://blog.csdn.net/weixin_44505194/article/details/106634835

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。