
Crawling the Recently Updated Content of the Jokeji (笑话集) Website

Please credit the source when reposting: /xiaojimanman/article/details/19158815

This post mainly describes how the content of the recently updated list pages of the jokeji (笑话集) site is crawled. The program source code can be downloaded at: /detail/xiaojimanman/6918997

First, the crawl entry point. Periodic crawling is not implemented in this program; you can write the corresponding scheduling thread yourself as needed (a minimal sketch follows the discussion of crawlMain below).

/**
 * @Description: entry point that drives the jokeji crawl
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;

import cn.lulei.db.jokeji.JokeDbOperation;
import cn.lulei.model.Jokeji;
import cn.lulei.util.ParseUtil;

public class JokeCrawl {
    // url pattern of the jokeji update list pages
    private static String listPageUrl = "/list_%pno%.htm";
    // interval between two page requests, in ms
    private static int sleepTime = 500;

    /**
     * @param start first list page
     * @param end   last list page
     * @throws IOException
     * @Description: crawl the content linked from the update list pages
     */
    public void crawlMain(int start, int end) throws IOException {
        start = start < 1 ? 1 : start;
        JokeDbOperation jokeDbOperation = new JokeDbOperation();
        for ( ; start <= end; start++) {
            sleep(sleepTime);
            JokeList jokeList = new JokeList(listPageUrl.replace("%pno%", start + ""));
            ArrayList<String> array = jokeList.getPageUrls();
            // de-duplicate the detail-page urls found on the list page
            HashSet<String> hash = ParseUtil.parseArrayToHash(array);
            for (String s : hash) {
                JokeDetail jokeDetail = new JokeDetail(s);
                Jokeji jokeji = jokeDetail.getJokeji();
                jokeDbOperation.insert(jokeji);
                System.out.println("网址:" + s + "采集完成!");
                sleep(sleepTime);
            }
        }
    }

    /**
     * @param sleepTime
     * @Description: pause the current thread for sleepTime milliseconds
     */
    public void sleep(int sleepTime) {
        try {
            TimeUnit.MILLISECONDS.sleep(sleepTime);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            new JokeCrawl().crawlMain(1, 380);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

The method public void crawlMain(int start, int end) crawls the list pages from page start to page end. The interval between two page requests is set to 500 ms here; you can adjust it to match your machine and network, but it is not advisable to make it much smaller, or the jokeji site may block you.
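The downloadable source does not include a periodic scheduler. As a rough sketch only (my own addition, not part of the original project), a scheduling thread based on java.util.concurrent.ScheduledExecutorService could look like this; the class name JokeCrawlScheduler, the 6-hour delay and the 1 to 10 page range are arbitrary choices:

// Hypothetical scheduler sketch: re-runs the crawl of the first few list pages on a fixed delay.
package cn.lulei.crawl.jokeji;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class JokeCrawlScheduler {
    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        scheduler.scheduleWithFixedDelay(new Runnable() {
            public void run() {
                try {
                    // only the first few list pages normally change between runs
                    new JokeCrawl().crawlMain(1, 10);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }, 0, 6, TimeUnit.HOURS);
    }
}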

The CrawlBase class below fetches web page resources; its pageSourceCode field stores the source code of the current page for the later processing steps. It is the base class for fetching page content: for different page formats and different content to extract, you simply build the corresponding subclasses.

/**
 * @Description: base class for fetching web page content
 */
package cn.lulei.crawl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;

public abstract class CrawlBase {
    private static Logger log = Logger.getLogger(CrawlBase.class);

    // source code of the last fetched page
    private String pageSourceCode = "";
    // response headers of the last request
    private Header[] responseHeaders = null;
    // connect timeout, in ms
    private static int connectTimeout = 3500;
    // read (socket) timeout, in ms
    private static int readTimeout = 3500;
    // default maximum number of attempts per url
    private static int maxConnectTimes = 3;

    private static HttpClient httpClient = new HttpClient();

    static {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
    }

    /**
     * @return whether the request succeeded
     * @Description: fetch the page with the given http method ("get" or "post")
     */
    public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
        if ("post".equals(method) || "POST".equals(method)) {
            return readPageByPost(urlStr, charsetName, params);
        } else {
            return readPageByGet(urlStr, charsetName, params);
        }
    }

    /**
     * @return whether the request succeeded
     * @Description: fetch the page with a GET request
     */
    public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        GetMethod getMethod = createGetMethod(urlStr, params);
        return readPage(getMethod, charsetName, urlStr);
    }

    /**
     * @return whether the request succeeded
     * @Description: fetch the page with a POST request
     */
    public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
        PostMethod postMethod = createPostMethod(urlStr, params);
        return readPage(postMethod, charsetName, urlStr);
    }

    /**
     * @return whether the request succeeded
     * @Description: execute the request, then read the response headers and the page source
     */
    private boolean readPage(HttpMethod method, String charsetName, String urlStr) throws HttpException, IOException {
        int n = maxConnectTimes;
        while (n > 0) {
            try {
                if (httpClient.executeMethod(method) != HttpStatus.SC_OK) {
                    log.error("can not connect " + urlStr);
                    return false;
                }
                // response headers
                responseHeaders = method.getResponseHeaders();
                // page source code
                InputStream inputStream = method.getResponseBodyAsStream();
                BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
                StringBuffer stringBuffer = new StringBuffer();
                String lineString = null;
                while ((lineString = bufferedReader.readLine()) != null) {
                    stringBuffer.append(lineString);
                }
                pageSourceCode = stringBuffer.toString();
                return true;
            } catch (Exception e) {
                System.out.println(urlStr + " -- can't connect " + (maxConnectTimes - n + 1));
                n--;
            }
        }
        return false;
    }

    /**
     * @return GetMethod
     * @Description: build a GetMethod and set the given request headers
     */
    @SuppressWarnings("rawtypes")
    private GetMethod createGetMethod(String urlStr, HashMap<String, String> params) {
        GetMethod getMethod = new GetMethod(urlStr);
        if (params == null) {
            return getMethod;
        }
        Iterator iter = params.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            String key = (String) entry.getKey();
            String val = (String) entry.getValue();
            getMethod.setRequestHeader(key, val);
        }
        return getMethod;
    }

    /**
     * @return PostMethod
     * @Description: build a PostMethod and set the given form parameters
     */
    @SuppressWarnings("rawtypes")
    private PostMethod createPostMethod(String urlStr, HashMap<String, String> params) {
        PostMethod postMethod = new PostMethod(urlStr);
        if (params == null) {
            return postMethod;
        }
        Iterator iter = params.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            String key = (String) entry.getKey();
            String val = (String) entry.getValue();
            postMethod.setParameter(key, val);
        }
        return postMethod;
    }

    /**
     * @return whether the request succeeded
     * @Description: fetch the page with a GET request and no extra headers
     */
    public boolean readPageByGet(String urlStr, String charsetName) throws IOException {
        return this.readPageByGet(urlStr, charsetName, null);
    }

    /**
     * @Description: return the page source code of the last fetch
     */
    public String getPageSourceCode() {
        return pageSourceCode;
    }

    /**
     * @Description: return the response headers of the last fetch
     */
    public Header[] getHeader() {
        return responseHeaders;
    }

    /**
     * @Description: set the connect timeout
     */
    public void setConnectTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
    }

    /**
     * @Description: set the read timeout
     */
    public void setReadTimeout(int timeout) {
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
    }

    /**
     * @Description: set the maximum number of attempts, used when a request fails
     */
    public static void setMaxConnectTimes(int maxConnectTimes) {
        CrawlBase.maxConnectTimes = maxConnectTimes;
    }

    /**
     * @Description: set both the connect timeout and the read timeout
     */
    public void setTimeout(int connectTimeout, int readTimeout) {
        setConnectTimeout(connectTimeout);
        setReadTimeout(readTimeout);
    }
}
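Since CrawlBase declares no abstract methods of its own, an anonymous subclass is enough to try it out. The following is only a usage sketch of my own (the url is a placeholder), not code from the original project:

// Hypothetical usage sketch of CrawlBase: fetch a page and print its source code.
package cn.lulei.test;

import java.io.IOException;

import cn.lulei.crawl.CrawlBase;

public class CrawlBaseDemo {
    public static void main(String[] args) throws IOException {
        CrawlBase crawl = new CrawlBase() { };   // anonymous subclass of the abstract base
        crawl.setTimeout(5000, 5000);            // connect / read timeout in milliseconds
        if (crawl.readPageByGet("http://www.example.com/", "utf-8")) {
            System.out.println(crawl.getPageSourceCode());
        } else {
            System.out.println("fetch failed after the configured retries");
        }
    }
}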

Since the way detail-page links appear on update list pages is much the same across most sites, CrawlBase is wrapped once more into the CrawlListPageBase class, which extracts the link urls from an update list page.

/**
 * @Description: base class for extracting link urls from a list page
 */
package cn.lulei.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.util.DoRegex;

public abstract class CrawlListPageBase extends CrawlBase {
    private String pageurl;

    public CrawlListPageBase(String urlStr, String charsetName) throws IOException {
        readPageByGet(urlStr, charsetName);
        pageurl = urlStr;
    }

    public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException {
        readPage(urlStr, charsetName, method, params);
        pageurl = urlStr;
    }

    /**
     * @return the wanted link urls found on the page, resolved to absolute addresses
     */
    public ArrayList<String> getPageUrls() {
        ArrayList<String> pageUrls = new ArrayList<String>();
        pageUrls = DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
        return pageUrls;
    }

    /**
     * @return the regular expression that matches the wanted link urls on the page
     */
    public abstract String getUrlRegexString();

    /**
     * @return the index of the capture group that holds the url
     */
    public abstract int getUrlRegexStringNum();
}

To subclass it, you only need to implement the two abstract methods public abstract String getUrlRegexString(); and public abstract int getUrlRegexStringNum();. The implementation for the jokeji update list page is as follows:

/**
 * @Description: recently updated list page of jokeji
 * @Version: 1.1.0
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import cn.lulei.crawl.CrawlListPageBase;

public class JokeList extends CrawlListPageBase {
    // request headers used when fetching the jokeji list pages
    private static HashMap<String, String> params = new HashMap<String, String>();

    static {
        params.put("Host", "");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
    }

    public JokeList(String urlStr) throws IOException {
        this(urlStr, "gb2312");
    }

    public JokeList(String urlStr, String charsetName) throws IOException {
        super(urlStr, charsetName, "get", params);
    }

    @Override
    public String getUrlRegexString() {
        // regular expression for the detail-page links on the list page
        return "<li><b><a href=\"(.*?)\"target=\"_blank\"";
    }

    @Override
    public int getUrlRegexStringNum() {
        return 1;
    }

    // main function test
    public static void main(String[] args) {
        try {
            JokeList jokeList = new JokeList("/list_1.htm", "gb2312");
            ArrayList<String> array = jokeList.getPageUrls();
            for (String s : array) {
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

With the above encapsulation, extracting the link addresses from a list page becomes straightforward. The implementation above also relies on the regular-expression utility class DoRegex, whose implementation is as follows:

/**
 * @Description: regular-expression utility
 */
package cn.lulei.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DoRegex {
    private static String rootUrlRegex = "(http://.*?/)";
    private static String currentUrlRegex = "(http://.*/)";
    private static String ChRegex = "([\u4e00-\u9fa5]+)";

    /**
     * @return all matches of capture group n, joined with splitStr
     */
    public static String getString(String dealStr, String regexStr, String splitStr, int n) {
        String reStr = "";
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reStr;
        }
        splitStr = (splitStr == null) ? "" : splitStr;
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        StringBuffer stringBuffer = new StringBuffer();
        while (matcher.find()) {
            stringBuffer.append(matcher.group(n).trim());
            stringBuffer.append(splitStr);
        }
        reStr = stringBuffer.toString();
        // drop the trailing separator
        if (!"".equals(splitStr) && reStr.endsWith(splitStr)) {
            reStr = reStr.substring(0, reStr.length() - splitStr.length());
        }
        return reStr;
    }

    /**
     * @return all matches of capture group n, joined into one string
     */
    public static String getString(String dealStr, String regexStr, int n) {
        return getString(dealStr, regexStr, null, n);
    }

    /**
     * @return the first match of capture group n, or "" if there is none
     */
    public static String getFirstString(String dealStr, String regexStr, int n) {
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return "";
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            return matcher.group(n).trim();
        }
        return "";
    }

    /**
     * @return all matches of capture group n as an ArrayList
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(matcher.group(n).trim());
        }
        return reArrayList;
    }

    /**
     * @Description: build an absolute url from a (possibly relative) url and the current page url
     */
    private static String getHttpUrl(String url, String currentUrl) {
        try {
            url = encodeUrlCh(url);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (url.indexOf("http") == 0) {
            return url;
        }
        if (url.indexOf("/") == 0) {
            return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
        }
        return getFirstString(currentUrl, currentUrlRegex, 1) + url;
    }

    /**
     * @return all matched urls of capture group n, resolved to absolute addresses
     */
    public static ArrayList<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n) {
        ArrayList<String> reArrayList = new ArrayList<String>();
        if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()) {
            return reArrayList;
        }
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(dealStr);
        while (matcher.find()) {
            reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
        }
        return reArrayList;
    }

    /**
     * @Description: url-encode the Chinese characters in a link address
     */
    public static String encodeUrlCh(String url) throws UnsupportedEncodingException {
        while (true) {
            String s = getFirstString(url, ChRegex, 1);
            if ("".equals(s)) {
                return url;
            }
            url = url.replaceAll(s, URLEncoder.encode(s, "utf-8"));
        }
    }
}

This class implements a number of regex matching and lookup helpers, as well as the conversion of relative page addresses into absolute ones; see the comments in the code for details.
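As a small illustration (not from the original source), the four-argument getArrayList can be used like this; the html snippet and the host in currentUrl are made-up placeholders, since the real site host is omitted throughout this post:

// Hypothetical DoRegex usage sketch: extract the href value with capture group 1 and resolve it
// against the current page url, the same way CrawlListPageBase.getPageUrls() does internally.
package cn.lulei.test;

import java.util.ArrayList;

import cn.lulei.util.DoRegex;

public class DoRegexDemo {
    public static void main(String[] args) {
        String html = "<li><b><a href=\"/jokehtml/bxnn/0926220449.htm\" target=\"_blank\">test</a></b></li>";
        String regex = "<a href=\"(.*?)\"";                      // group 1 holds the relative link
        String currentUrl = "http://www.example.com/list_1.htm"; // placeholder for the list page url
        ArrayList<String> urls = DoRegex.getArrayList(html, regex, currentUrl, 1);
        for (String url : urls) {
            // prints http://www.example.com/jokehtml/bxnn/0926220449.htm
            System.out.println(url);
        }
    }
}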

With the detail-page urls obtained through the JokeList class, all that remains is to create a JokeDetail class that processes a jokeji detail page. The code is as follows:

/**
 * @Description: jokeji detail content page
 */
package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.HashMap;

import org.apache.commons.httpclient.HttpException;

import cn.lulei.crawl.CrawlBase;
import cn.lulei.model.Jokeji;
import cn.lulei.util.DoRegex;
import cn.lulei.util.ParseMD5;

public class JokeDetail extends CrawlBase {
    // request headers used when fetching a jokeji detail page
    private static HashMap<String, String> params = new HashMap<String, String>();
    // regular expression that matches the content section of the page
    private static String contentAllRegexString = "<span id=\"text110\">(.*?)</span>";
    private String pageUrl;

    static {
        params.put("Host", "");
        params.put("Pragma", "no-cache");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
        params.put("Referer", "/list.htm");
    }

    protected JokeDetail(String urlStr) throws HttpException, IOException {
        this(urlStr, "gb2312");
    }

    protected JokeDetail(String urlStr, String charsetName) throws HttpException, IOException {
        this.pageUrl = urlStr;
        readPage(urlStr, charsetName, "get", params);
    }

    /**
     * @return a Jokeji object filled with the page url, the url's md5 and the joke content
     */
    protected Jokeji getJokeji() {
        Jokeji jokeji = new Jokeji();
        jokeji.setPageUrl(pageUrl);
        jokeji.setUrlMd5(ParseMD5.ParseStrToMd5L32(pageUrl));
        jokeji.setContent(getContent());
        return jokeji;
    }

    /**
     * @return the joke text, with html entities and tags removed and line breaks replaced by #br#
     */
    private String getContent() {
        String contentAll = DoRegex.getFirstString(getPageSourceCode(), contentAllRegexString, 1);
        contentAll = contentAll.replaceAll("&.*?;", "")
                .replaceAll("<br>", "#br#").replaceAll("<BR>", "#br#")
                .replaceAll("</BR>", "#br#").replaceAll("</br>", "#br#")
                .replaceAll("</P>", "#br#").replaceAll("</p>", "#br#")
                .replaceAll("<.*?>", "");
        return contentAll;
    }

    // main function test
    public static void main(String[] args) throws HttpException, IOException {
        JokeDetail jokeDetail = new JokeDetail("/jokehtml/bxnn/0926220449.htm");
        System.out.println(jokeDetail.getContent());
    }
}

At this point, the crawl of jokeji from the update list pages down to the detail pages is complete; for the overall workflow, refer to the JokeCrawl class above.

Everything described above concerns crawling the jokeji site. Data storage involves no elaborate design: JokeCrawl simply calls the corresponding storage and de-duplication methods; refer to the corresponding source code for their implementation.
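The Jokeji model and JokeDbOperation themselves are not listed in this post. Judging only from how they are called in JokeCrawl and JokeDetail, the model presumably looks roughly like the sketch below; the getters and anything beyond the pageUrl, urlMd5 and content fields are assumptions on my part, with urlMd5 most likely serving as the unique key for de-duplication:

// Hypothetical sketch of the Jokeji model implied by the calls in JokeDetail.getJokeji()
// and JokeDbOperation.insert(); not the original source.
package cn.lulei.model;

public class Jokeji {
    private String pageUrl;  // detail page url
    private String urlMd5;   // 32-character md5 of the url, assumed to be the de-duplication key
    private String content;  // joke text with line breaks replaced by #br#

    public String getPageUrl() { return pageUrl; }
    public void setPageUrl(String pageUrl) { this.pageUrl = pageUrl; }
    public String getUrlMd5() { return urlMd5; }
    public void setUrlMd5(String urlMd5) { this.urlMd5 = urlMd5; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}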
