
Getting Started with Java Crawlers (2): Single machine, single program, single thread - fetching news articles from a seed url with a breadth-first algorithm

2017-11-23 15:05
Building on Part 1, this post adds a simple breadth-first url crawling algorithm. Drawback: it is single-threaded - url discovery and news content crawling are all dumped into the same thread, so it is very slow. This will be optimized in follow-up posts. (I have kept an eye on the request rate and have not crawled aggressively - the program has only just started - so there is no IP proxy in place.)

Problems still to solve: use multiple threads to separate the two concerns (content crawling and url discovery) and raise the crawling throughput; improve the crawling algorithm; pick a better data structure for the to-crawl queue. A rough sketch of the planned thread separation follows the main test method below.

----------------main test method-------------

package com.kimt.newsdrawler;

import com.kimt.newsdrawler.crawler.IFengCrawler;
import com.kimt.newsdrawler.dto.News;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Created by man on 2017/11/21.
 */
public class UserMain {

    private static Logger logger = LoggerFactory.getLogger(UserMain.class);

    public static void main(String[] args) {
        /* version_1.0
        new IFengCrawler("http://news.ifeng.com/a/20171121/53459907_0.shtml").parserForNews();*/

        /* version_2.0 test the breadth-first traversal on its own
        // initialize the to-crawl url queue and the crawled url set
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        // pass in the seed url and collect urls into the queue
        new IFengUrlCatcher(toCatcheUrl, catchedUrl).urlCatch("http://news.ifeng.com/");
        logger.info("info:", toCatcheUrl);*/

        /* version_2.1 test the breadth-first traversal together with content crawling */
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        List<News> list = new IFengCrawler(toCatcheUrl, catchedUrl).parserForNews("http://news.ifeng.com/");
        logger.info("Crawled " + list.size() + " news articles in total");
    }
}
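As a preview of the planned optimization, the sketch below shows one way the shared LinkedBlockingQueue could support separating url discovery from content parsing: a dispatcher thread expands the frontier while a small worker pool parses pages. This is only an assumed design, not part of this post's working code; the commented placeholders stand in for the logic of IFengUrlCatcher.urlCatch(...) and IFengCrawler, and the class/thread-pool names are made up for illustration.

package com.kimt.newsdrawler;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Rough sketch only: dispatcher thread expands the url frontier,
 * worker threads parse the article pages.
 */
public class ThreadedCrawlSketch {

    public static void main(String[] args) throws InterruptedException {
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        Set<String> catchedUrl = Collections.synchronizedSet(new HashSet<String>());
        ExecutorService workers = Executors.newFixedThreadPool(4);

        // seed the frontier
        toCatcheUrl.put("http://news.ifeng.com/");

        Thread dispatcher = new Thread(() -> {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    String url = toCatcheUrl.take();
                    if (!catchedUrl.add(url)) {
                        continue; // already dispatched, skip it
                    }
                    // 1) url discovery: fetch the page and put newly found urls on toCatcheUrl
                    // 2) content parsing: hand the page off to a worker
                    workers.submit(() -> {
                        // download the page and parse the article here
                    });
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        });
        dispatcher.start();
    }
}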
----------------Seed-url based breadth-first url crawling-------------
package com.kimt.newsdrawler.urlcatcher;

import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @author man
 * Created on 2017/11/22
 */
public class IFengUrlCatcher extends AbstractUrlCatcher {

    private static Logger logger = LoggerFactory.getLogger(IFengUrlCatcher.class);
    /** queue of urls waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** set of urls that have already been crawled */
    private HashSet<String> catchedUrl;

    /**
     * @param toCatcheUrl queue of urls waiting to be crawled
     * @param catchedUrl  set of urls that have already been crawled
     */
    public IFengUrlCatcher(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
    }

    @Override
    public void urlCatch(String seedUrl) {
        try {
            CloseableHttpResponse httpResponse = HttpClientUtil.getHttpResponse(seedUrl);
            HttpEntity entity = httpResponse.getEntity();
            // convert the entity into an html string
            String html = EntityUtils.toString(entity, "utf-8");
            // walk the page and enqueue every url that may lead to news content
            traversalUrlForIFengNews(html);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walk the html page, collect every url that may lead to news content and enqueue it.
     * The selection rules differ from one news site to another.
     * @param html page source to scan
     */
    private void traversalUrlForIFengNews(String html) {
        String baseUrl = "news.ifeng.com";
        String url;
        Document doc = Jsoup.parse(html);
        // collect all <a> tags on the page
        Elements elements = doc.getElementsByTag("a");
        // walk the <a> tags
        for (Element e : elements) {
            // read the href attribute (the url) of each <a> tag
            url = e.attr("href");
            // enqueue the url if it belongs to the news site and has not been crawled yet
            if (url.contains(baseUrl) && !catchedUrl.contains(url)) {
                try {
                    toCatcheUrl.put(url.trim());
                } catch (InterruptedException e1) {
                    e1.printStackTrace();
                    logger.error("InterruptedException: {}", e1.getMessage(), e1);
                }
            }
        }
    }

}
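One behavior worth noting: traversalUrlForIFengNews only checks catchedUrl, which IFengCrawler fills in after a page has been parsed, so a link that appears on many pages can be enqueued several times before it is first crawled. A possible refinement - an assumption, not part of the original class - is an extra set recording everything that has ever been enqueued. The fragment below is a sketch meant to live inside IFengUrlCatcher; seenUrl and enqueueIfNew are hypothetical names.

    // Sketch only: hypothetical extra field and helper for IFengUrlCatcher,
    // preventing the same url from entering the queue more than once.
    private final HashSet<String> seenUrl = new HashSet<String>();

    private void enqueueIfNew(String url) throws InterruptedException {
        String trimmed = url.trim();
        // enqueue only urls that were neither crawled yet nor already queued
        if (!catchedUrl.contains(trimmed) && seenUrl.add(trimmed)) {
            toCatcheUrl.put(trimmed);
        }
    }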
----------------HttpClient wrapper utility to reduce duplicated code (to be improved later)-------------
package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * @author man
 * Created on 2017/11/22
 */
public class HttpClientUtil {

    private static Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);

    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        // set request headers
        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpGet.setHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        // execute the GET request
        return client.execute(httpGet);
    }
}
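Note that the callers above never explicitly close the response returned by getHttpResponse, so connections may not be released promptly. Since CloseableHttpResponse is Closeable, a hedged usage sketch with try-with-resources (the url is just an example) would be:

    // Sketch: make sure the response (and its connection) is released after use.
    try (CloseableHttpResponse response = HttpClientUtil.getHttpResponse("http://news.ifeng.com/")) {
        String html = EntityUtils.toString(response.getEntity(), "utf-8");
        // hand html over to Jsoup here
    } catch (IOException e) {
        // log the failure and move on to the next url
    }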
------------News crawling class-----------
package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import com.kimt.newsdrawler.urlcatcher.IFengUrlCatcher;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @author kimt
 * Created by man on 2017/11/23.
 */
public class IFengCrawler extends AbstractCrawler {

    private Logger logger = LoggerFactory.getLogger(IFengCrawler.class);
    /** queue of urls waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** set of urls that have already been crawled */
    private HashSet<String> catchedUrl;
    private IFengUrlCatcher urlCatcher;

    public IFengCrawler(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
        this.urlCatcher = new IFengUrlCatcher(toCatcheUrl, catchedUrl);
    }

    @Override
    public List<News> parserForNews(String seedUrl) {
        // crawl the seed url first to initialize the to-crawl queue
        urlCatcher.urlCatch(seedUrl);
        List<News> list = new ArrayList<News>();
        try {
            String url;
            int stateCode;
            HttpEntity entity;
            CloseableHttpResponse response;
            // process up to 200 urls from the to-crawl queue
            for (int i = 0; i < 200; i++) {
                // take one url from the to-crawl queue
                url = toCatcheUrl.take();
                // breadth-first step: collect every crawlable url on this page (enqueued internally)
                urlCatcher.urlCatch(url);
                // send a GET request to the url
                response = HttpClientUtil.getHttpResponse(url);
                // read the http status code
                stateCode = response.getStatusLine().getStatusCode();
                if (stateCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    News news = null;
                    // get the entity from the response
                    entity = response.getEntity();
                    // convert the entity into an html string
                    String html = EntityUtils.toString(entity, "utf-8");
                    // parse the html with Jsoup
                    Document doc = Jsoup.parse(html);
                    String title = doc.title();
                    // first kind of ifeng news page
                    Element articleDiv = doc.getElementById("artical");
                    if (articleDiv != null) {
                        news = parseOne(articleDiv, title);
                    } else {
                        // second kind of ifeng news page, parsed with the second strategy
                        Element article2Div = doc.getElementsByClass("yc_main wrap").first();
                        if (article2Div != null) {
                            news = parseTwo(article2Div, title);
                        }
                    }
                    // keep the news object if parsing succeeded
                    if (news != null) {
                        list.add(news);
                    }
                    // mark the url as crawled
                    catchedUrl.add(url);
                    // release resources
                    EntityUtils.consume(entity);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException: " + e.getMessage());
        } catch (ParseException e) {
            e.printStackTrace();
            logger.error("ParseException: " + e.getMessage());
        } catch (InterruptedException e) {
            e.printStackTrace();
            logger.error("InterruptedException: " + e.getMessage());
        }
        return list;
    }

    /**
     * Inspect the page source with the browser dev tools, locate the relevant dom nodes
     * and extract the wanted data with Jsoup.
     * @param articleDiv div closest to the news content
     * @param title article title
     * @return a News object
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // node holding the news origin and publish time
            Element headDiv = articleDiv.getElementById("artical_sth");
            // node holding the news content
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                // read the publish time
                String publishTime = headDiv.getElementsByClass("ss01").text();
                // read the news origin
                String origin = headDiv.getElementsByClass("ss03").text();
                // convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // fill the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // drop img tags
                contentDiv.select("img").remove();
                // keep the content as html (rather than text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // fill the News object
                news.setContent(content);
            }
        }
        return news;
    }

    /**
     * Inspect the page source with the browser dev tools, locate the relevant dom nodes
     * and extract the wanted data with Jsoup.
     * @param article2Div div closest to the news content
     * @param title article title
     * @return a News object
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            // node holding the news origin and publish time
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            // node holding the news content
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                // read the publish time
                String publishTime = headDiv.getElementsByTag("span").text();
                // read the news origin
                String origin = headDiv.getElementsByTag("a").first().text();
                // convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // fill the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // drop unwanted div and script tags
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // keep the content as html (rather than text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // fill the News object
                news.setContent(content);
            }
        }
        return news;
    }
}
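The News DTO referenced throughout the code is not shown in this post. A minimal sketch inferred from the setters called above (title, publish time, origin, content - the actual field names in the project may differ) would look like:

package com.kimt.newsdrawler.dto;

import java.util.Date;

/**
 * Minimal sketch of the News DTO inferred from the calls above;
 * the real class in the project may carry more fields.
 */
public class News {

    private String title;
    private Date publishTime;
    private String origin;
    private String content;

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }

    public Date getPublishTime() { return publishTime; }
    public void setPublishTime(Date publishTime) { this.publishTime = publishTime; }

    public String getOrigin() { return origin; }
    public void setOrigin(String origin) { this.origin = origin; }

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}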

                                            