Java爬虫入门(二):单机单程序单线程-提供种子url用广度优先算法实现新闻资讯获取
2017-11-23 15:05
561 查看
在一的基础上,简单新增了广度url爬取算法。缺点:单线程,url爬取算法,新闻内容爬取,都丢在同一个线程,效率很慢。 后续继续优化。(自己有留意,没有爬取过疯狂(程序刚入门),所以没有ip跳板。)待解决问题: 用多线程,实现业务分离(内容爬取算法,url爬取算法),提高抓取效率,优化抓取算法,待抓取队列数据结构选用,

----------------main测试方法-------------

package com.kimt.newsdrawler;

import com.kimt.newsdrawler.crawler.IFengCrawler;
import com.kimt.newsdrawler.dto.News;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Entry point: seeds the breadth-first crawler with the IFeng news portal
 * URL and logs how many news items were fetched.
 *
 * Created by man on 2017/11/21.
 */
public class UserMain {

    private static final Logger logger = LoggerFactory.getLogger(UserMain.class);

    public static void main(String[] args) {
        /* version_1.0: crawl a single hard-coded article page
           new IFengCrawler("http://news.ifeng.com/a/20171121/53459907_0.shtml").parserForNews(); */
        /* version_2.0: test the breadth-first URL traversal only
           LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
           HashSet<String> catchedUrl = new HashSet<String>();
           new IFengUrlCatcher(toCatcheUrl, catchedUrl).urlCatch("http://news.ifeng.com/");
           logger.info("info:", toCatcheUrl); */

        /* version_2.1: breadth-first traversal AND content crawling */
        // Queue of URLs still waiting to be crawled (the BFS frontier).
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        // URLs already crawled, to avoid revisiting.
        HashSet<String> catchedUrl = new HashSet<String>();
        List<News> list = new IFengCrawler(toCatcheUrl, catchedUrl).parserForNews("http://news.ifeng.com/");
        // FIX: parameterized SLF4J logging instead of string concatenation.
        logger.info("一共爬取了 {} 条新闻", list.size());
    }
}
----------------提供种子url,广度url爬取算法-------------
package com.kimt.newsdrawler.urlcatcher;

import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Breadth-first URL collector for news.ifeng.com: fetches a page, extracts
 * every in-domain link, and enqueues the ones not crawled yet.
 *
 * @author man
 * @Date create on 2017/11/22
 */
public class IFengUrlCatcher extends AbstractUrlCatcher {

    private static final Logger logger = LoggerFactory.getLogger(IFengUrlCatcher.class);

    /** Queue of URLs waiting to be crawled. */
    private final LinkedBlockingQueue<String> toCatcheUrl;
    /** URLs that have already been crawled. */
    private final HashSet<String> catchedUrl;

    /**
     * @param toCatcheUrl queue of URLs waiting to be crawled
     * @param catchedUrl  set of URLs already crawled
     */
    public IFengUrlCatcher(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
    }

    /**
     * Fetches {@code seedUrl} and enqueues every crawlable news link found
     * on that page.
     *
     * @param seedUrl page whose outgoing links should be harvested
     */
    @Override
    public void urlCatch(String seedUrl) {
        CloseableHttpResponse httpResponse = null;
        try {
            httpResponse = HttpClientUtil.getHttpResponse(seedUrl);
            HttpEntity entity = httpResponse.getEntity();
            // Decode the response body into an HTML string.
            String html = EntityUtils.toString(entity, "utf-8");
            // Enqueue every crawlable news URL found on this page.
            traversalUrlForIFengNews(html);
        } catch (IOException e) {
            // FIX: log with the exception object instead of printStackTrace().
            logger.error("Failed to fetch {}", seedUrl, e);
        } finally {
            // FIX: the response was never closed in the original (connection leak).
            if (httpResponse != null) {
                try {
                    httpResponse.close();
                } catch (IOException ignored) {
                    // best effort - nothing useful to do if close fails
                }
            }
        }
    }

    /**
     * Extracts all &lt;a href&gt; values from {@code html} and enqueues those
     * that point into news.ifeng.com and have not been crawled yet.
     * Site-specific: a different news site needs a different traversal.
     *
     * @param html raw HTML of the page just fetched
     */
    private void traversalUrlForIFengNews(String html) {
        String baseUrl = "news.ifeng.com";
        Document doc = Jsoup.parse(html);
        // All anchor tags on the page.
        Elements elements = doc.getElementsByTag("a");
        for (Element e : elements) {
            String url = e.attr("href");
            // Enqueue only in-domain links that were not crawled before.
            // NOTE(review): links already sitting in the queue are not
            // deduplicated here, so the same URL can be enqueued twice.
            if (url.contains(baseUrl) && !catchedUrl.contains(url)) {
                try {
                    toCatcheUrl.put(url.trim());
                } catch (InterruptedException e1) {
                    // FIX: restore the interrupt flag and log the throwable
                    // itself; the original passed e1.getMessage() as a
                    // throwaway argument with no placeholder and lost the
                    // stack trace.
                    Thread.currentThread().interrupt();
                    logger.error("Interrupted while enqueueing {}", url, e1);
                    return;
                }
            }
        }
    }
}
----------------包装httpClient工具类,减少重复代码(后续待优化)-------------
package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Thin wrapper around Apache HttpClient that issues GET requests with
 * browser-like headers, to reduce duplicated request-building code.
 *
 * @author man
 * @Date create on 2017/11/22
 */
public class HttpClientUtil {

    private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);

    // FIX: the original built a brand-new CloseableHttpClient on every call
    // and never closed it (connection-pool leak). One shared, thread-safe
    // client instance is sufficient for the whole crawler.
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    /** Utility class - not instantiable. */
    private HttpClientUtil() {
    }

    /**
     * Executes an HTTP GET against {@code url}.
     * The caller is responsible for closing the returned response.
     *
     * @param url absolute URL to fetch
     * @return the raw response (body not yet consumed)
     * @throws IOException on connection or protocol failure
     */
    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Browser-like request headers.
        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpGet.setHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        // Execute the GET request on the shared client.
        return CLIENT.execute(httpGet);
    }
}

------------新闻爬取类-----------
package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import com.kimt.newsdrawler.urlcatcher.IFengUrlCatcher;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Single-threaded breadth-first news crawler for news.ifeng.com: takes URLs
 * from the shared frontier queue, expands their links, and parses article
 * pages into {@link News} objects. Two known page layouts are supported.
 *
 * @author kimt
 * Created by man on 2017/11/23.
 */
public class IFengCrawler extends AbstractCrawler {

    private static final Logger logger = LoggerFactory.getLogger(IFengCrawler.class);

    /** Queue of URLs waiting to be crawled (the BFS frontier). */
    private final LinkedBlockingQueue<String> toCatcheUrl;
    /** URLs that have already been crawled. */
    private final HashSet<String> catchedUrl;
    /** Collaborator that expands a page's outgoing links into the queue. */
    private final IFengUrlCatcher urlCatcher;

    public IFengCrawler(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
        this.urlCatcher = new IFengUrlCatcher(toCatcheUrl, catchedUrl);
    }

    /**
     * Crawls up to 200 pages breadth-first starting from {@code seedUrl}
     * and returns the news articles successfully parsed along the way.
     *
     * @param seedUrl portal page used to seed the frontier queue
     * @return parsed news items (possibly empty, never null)
     */
    @Override
    public List<News> parserForNews(String seedUrl) {
        // Seed the frontier from the portal page.
        urlCatcher.urlCatch(seedUrl);
        List<News> list = new ArrayList<News>();
        // Hard cap of 200 pages for this single-threaded prototype.
        for (int i = 0; i < 200; i++) {
            CloseableHttpResponse response = null;
            try {
                // Blocks until a URL is available.
                String url = toCatcheUrl.take();
                // FIX: mark as crawled as soon as it is dequeued, so a URL
                // that fails (non-200, IO error) is not re-enqueued forever;
                // the original only marked successful pages.
                catchedUrl.add(url);
                // Breadth-first expansion: enqueue this page's outgoing links.
                urlCatcher.urlCatch(url);
                response = HttpClientUtil.getHttpResponse(url);
                int stateCode = response.getStatusLine().getStatusCode();
                if (stateCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    HttpEntity entity = response.getEntity();
                    String html = EntityUtils.toString(entity, "utf-8");
                    News news = parseNews(html);
                    if (news != null) {
                        list.add(news);
                    }
                    // Fully consume the entity so the connection can be reused.
                    EntityUtils.consume(entity);
                }
            } catch (IOException e) {
                // FIX: parameterized logging with the throwable; the original
                // used printStackTrace() plus string concatenation of
                // getMessage(), losing the stack trace. The try/catch is now
                // INSIDE the loop so one bad URL no longer aborts the crawl.
                logger.error("IOException while crawling", e);
            } catch (ParseException e) {
                logger.error("ParseException while parsing publish time", e);
            } catch (InterruptedException e) {
                // FIX: restore the interrupt flag and stop crawling.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while waiting for a URL", e);
                break;
            } finally {
                // FIX: the response was never closed in the original.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException ignored) {
                        // best effort - nothing useful to do on close failure
                    }
                }
            }
        }
        return list;
    }

    /**
     * Dispatches an article's HTML to the parser matching its page layout,
     * or returns null when neither known layout is present.
     *
     * @param html raw HTML of a fetched page
     * @return parsed News, or null for non-article pages
     */
    private News parseNews(String html) throws ParseException {
        Document doc = Jsoup.parse(html);
        String title = doc.title();
        // Layout #1: container div with id "artical".
        Element articleDiv = doc.getElementById("artical");
        if (articleDiv != null) {
            return parseOne(articleDiv, title);
        }
        // Layout #2: container div with class "yc_main wrap".
        Element article2Div = doc.getElementsByClass("yc_main wrap").first();
        if (article2Div != null) {
            return parseTwo(article2Div, title);
        }
        return null;
    }

    /**
     * Parses layout #1 of an IFeng article page. DOM node ids/classes were
     * identified by inspecting the page source in a browser debugger.
     *
     * @param articleDiv div closest to the article content (id "artical")
     * @param title      page title used as the news title
     * @return populated News object
     * @throws ParseException if the publish-time string has an unexpected format
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // Header block: publish time and source.
            Element headDiv = articleDiv.getElementById("artical_sth");
            // Body block: the article text.
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                String publishTime = headDiv.getElementsByClass("ss01").text();
                String origin = headDiv.getElementsByClass("ss03").text();
                // Convert the Chinese-formatted timestamp String -> Date.
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Drop images; keep HTML (not text()) so paragraphs survive.
                contentDiv.select("img").remove();
                news.setContent(contentDiv.html());
            }
        }
        return news;
    }

    /**
     * Parses layout #2 of an IFeng article page. DOM node ids/classes were
     * identified by inspecting the page source in a browser debugger.
     *
     * @param article2Div div closest to the article content (class "yc_main wrap")
     * @param title       page title used as the news title
     * @return populated News object
     * @throws ParseException if the publish-time string has an unexpected format
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            // Header block: publish time and source.
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            // Body block: the article text.
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                String publishTime = headDiv.getElementsByTag("span").text();
                // FIX: null-safe access; the original called first().text()
                // and would NPE on a header without an <a> tag.
                Element originLink = headDiv.getElementsByTag("a").first();
                String origin = originLink != null ? originLink.text() : "";
                // Convert the timestamp String -> Date.
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Strip nested divs and scripts that are not article text.
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // Keep HTML (not text()) so paragraph breaks survive.
                news.setContent(contentDiv.html());
            }
        }
        return news;
    }
}
相关文章推荐
- Java爬虫入门(一):单机单程序单线程-手动输入url获取新闻内容
- Java 网络爬虫获取网页源代码原理及实现
- Java 网络爬虫获取网页源代码原理及实现
- Java广度优先爬虫示例(抓取复旦新闻信息)
- dom4j来解析xml,通过URL获取服务器端返回的字符串,java swing 实现的页面
- (安全跨域)java.net.url实现后台发出http请求并输出获取到的数据
- Java 网络爬虫获取网页源代码原理及实现
- Java+MySQL实现网络爬虫程序
- Java实现爬虫给App提供数据(Jsoup 网络爬虫)
- 用多线程实现的Java爬虫程序2
- Java实现爬虫给App提供数据(Jsoup 网络爬虫)
- 入门的JAVA爬虫实现(附代码)
- Java 多线程 爬虫程序(spider)设计与实现
- Java+MySQL实现网络爬虫程序
- JTI + JNI,为Java程序提供获取JVM内部信息的通道
- Java实现的网络爬虫程序,简单易懂无框架(我的网络大作业)
- java实现网络爬虫程序
- 多线程实现的Java爬虫程序
- Java+MySQL实现网络爬虫程序
- 多线程实现的Java爬虫程序