1200字范文 > 使用JSoup实现爬虫操作（爬取网页图片文章内容）

使用JSoup实现爬虫操作（爬取网页图片文章内容）

时间：2019-06-27 15:57:58

一、基础配置：

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>

二、代码程序：

import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import java.io.IOException;//我们要读取网站中的前100篇//分页的时候，每一页有18篇//100篇就是（18*5+10）public class Reptile03 {public static void main(String[] args) {int count=0; //计数器int totalQuantity=100;//总共100篇int page=0; //从第一页开始在这个题目中 page最大就是6 第六页的第十篇ReptileService reptileService = new ReptileService();String url = "https://www./gdyw/list1.htm"; //初始页面Document document=null;Elements elements=null;while(count<100){try {document = Jsoup.connect(url).get();elements = document.select("div.col_news_con>div>div>ul>li>span>a");//System.out.println(elements.size()); //可以查看有多少条内容for(Element element:elements){count++;// 每读取一个数据，计数器就+1if(count==totalQuantity+1){System.exit(0);}// 文章链接 System.out.println(element.attr("href"));System.out.println("****************************************************************************************************************************************************************************************************");//分割线System.out.println("****************************************************************************************************************************************************************************************************");//分割线System.out.println("文章章节数:"+count); //篇数System.out.println(element.ownText());//文章标题//通过文章链接提取文章内容//先判断文件是否是https开始if(element.attr("href").startsWith("https")){System.out.println("文章链接："+element.attr("href"));//运行到这里，说明是https开始，我们直接从连接中提取reptileService.getArticleByOfficial(element.attr("href"));}else{System.out.println("文章链接："+"https://www."+element.attr("href"));//运行到这里，说明不是https开始，我们要改变一下reptileService.getArticle("https://www."+element.attr("href"));}}url= reptileService.changePage(url,page,count);System.out.println(url);} catch (IOException e) {e.printStackTrace();}}}}

import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import java.io.IOException;public class ReptileService {// 从href中获取文章的内容public void getArticle(String href){Elements articles =null;Elements articlesPicture;try {Document document = Jsoup.connect(href).get();//文字articles = document.select("div.wp_articlecontent>p");for(Element article:articles){//获取文章文字System.out.println(article.ownText());//但是数据比较冗杂}// 图片articlesPicture = document.select("div.wp_articlecontent>p>img");for(Element picture:articlesPicture){System.out.println("https://www./"+picture.attr("src"));//但是数据比较冗杂}} catch (IOException e) {e.printStackTrace();}}// 从href中获取文章的内容（来源于公众号）public void getArticleByOfficial(String href){Elements articles =null;Elements articlesPicture=null;try {Document document = Jsoup.connect(href).get();// 文字articles = document.select("div.rich_media_content>section>p>span");for(Element article:articles){System.out.println(article.ownText());//但是数据比较冗杂}// 图片articlesPicture = document.select("div.rich_media_content>section>p>img");for(Element picture:articlesPicture){System.out.println(picture.attr("src"));//但是数据比较冗杂}} catch (IOException e) {e.printStackTrace();}}// 当此页面的内容提取完了之后，要换下一个分页public String changePage(String url,int page,int count){page = (count/18)+1; //比如count是22时，page就是2，刚好就是第二页System.out.println("下一次进入页数："+page+",count:"+count);url = url.replace(url.substring(28,33),"list"+String.valueOf(page)); //当页数改变的时候，url也需要改变return url;}}

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。