您的位置:首页 > 编程语言 > Java开发

Java 实现爬虫(多线程)

2018-04-12 11:55 627 查看

1. 单线程爬虫



import java.util.ArrayList;
import java.util.List;


import javax.annotation.Resource;


import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;


import com.bm.entity.metabolize.Food;
import com.bm.service.download.FoodService;
import com.bm.util.HttpUtils;


@Controller
public class DownLoadController {

    /** Service that persists the scraped food records. */
    @Resource(name = "foodService")
    private FoodService foodService;

    /**
     * Entry point: crawls the food-calorie listing pages and stores the
     * parsed records in the database.
     *
     * @param args unused; kept for the original entry-point signature
     * @throws Exception if an HTTP request or the database insert fails
     */
    // @RequestMapping("download")
    public void execute(String[] args) throws Exception {
        // DefaultHttpClient is deprecated in newer HttpClient versions but is
        // the client style this project uses throughout.
        @SuppressWarnings("deprecation")
        HttpClient client = new DefaultHttpClient();
        // URLs to crawl; these could also be loaded from the database and
        // processed as a queue.
        String[] urls = {
                "https://www.cndzys.com/foodcalories/listbytype-%E8%9B%8B%E7%B1%BB.html",
                "https://www.cndzys.com/foodcalories/listbytype-%E8%B1%86%E7%B1%BB.html",
                "https://www.cndzys.com/foodcalories/listbytype-%E8%B0%B7%E7%B1%BB.html",
                "https://www.cndzys.com/foodcalories/listbytype-%E5%9D%9A%E6%9E%9C.html",
                "https://www.cndzys.com/foodcalories/listbytype-%E9%85%92%E9%A5%AE.html",
                "https://www.cndzys.com/foodcalories/listbytype-%E8%8D%AF%E9%A3%9F.html", };
        // Crawl every URL and collect the parsed food lists.
        List<List<Food>> fooddatas = getURLParser(client, urls);
        // Persist everything in one batch.
        foodService.insertFood(fooddatas);
    }

    /**
     * Fetches every URL and parses the food data out of the returned HTML.
     *
     * @param client shared HTTP client
     * @param urls   listing pages to fetch
     * @return one list of foods per successfully fetched (HTTP 200) page
     * @throws Exception if a request fails or the response cannot be read
     */
    public List<List<Food>> getURLParser(HttpClient client, String urls[])
            throws Exception {
        List<List<Food>> foodDatas = new ArrayList<List<Food>>();
        for (String url : urls) {
            // Fetch the raw HTML via the shared HttpUtils helper.
            HttpResponse response = HttpUtils.getRawHtml(client, url);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                String entity = EntityUtils.toString(response.getEntity(),
                        "utf-8");
                foodDatas.add(getData(entity));
            }
            // Always release the entity so the connection can be reused;
            // consume() is a no-op if toString() already drained it.
            EntityUtils.consume(response.getEntity());
        }
        return foodDatas;
    }

    /**
     * Parses one listing page into {@link Food} records.
     *
     * @param html page source
     * @return the foods found on the page (possibly empty)
     * @throws Exception on parse errors
     */
    public List<Food> getData(String html) throws Exception {
        List<Food> data = new ArrayList<Food>();
        // Parse with Jsoup; each food entry is an <li> under <ul class="list1">.
        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("ul[class=list1]").select("li");
        for (Element ele : elements) {
            String foodName = ele.select("h5").select("a").text();
            String foodEnergy = ele.select("p").text();
            // The energy text starts with a 3-character prefix followed by the
            // number and the "大卡" unit. Guard against malformed entries
            // instead of throwing StringIndexOutOfBoundsException (the
            // original substring calls had no bounds/indexOf checks).
            String energyValue = "";
            int unitIdx = foodEnergy.indexOf("大卡", 3);
            if (foodEnergy.length() > 3 && unitIdx >= 3) {
                energyValue = foodEnergy.substring(3, unitIdx);
            }
            // Wrap the extracted fields in the entity object.
            Food food = new Food();
            food.setFoodName(foodName);
            food.setFoodEnergy(foodEnergy);
            food.setEnergyValue(energyValue);
            data.add(food);
        }
        return data;
    }
}


2. 多线程实现爬虫

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import javax.annotation.Resource;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StopWatch;

import com.bm.dao.DaoSupport;
import com.bm.entity.BaseEntity;
import com.bm.service.system.exercise.ExerciseService;
import com.bm.util.HttpUtils;
import com.bm.util.UuidUtil;


/**
 * Crawls exercise/health-knowledge articles from several sites on a thread
 * pool and stores the parsed records via {@link ExerciseService}.
 *
 * @author zhangdong
 *
 *         2018-4-1 上午10:34:56
 */
public class ExerciseTask {

    @Resource(name = "exerciseService")
    private ExerciseService exerciseService;

    // Total number of records scraped so far. AtomicInteger because it is
    // incremented from multiple worker threads; the original used
    // "volatile int count++" which is not atomic and can lose updates.
    private static final AtomicInteger count = new AtomicInteger();

    private final static Logger logger = LoggerFactory
            .getLogger(ExerciseTask.class);

    // Site root used to absolutize relative article links.
    private static final String BASE_URL1 = "http://www.cndzys.com/";
    // 大众养生 batch crawl; "{#}" is replaced with the page number.
    private static String B2_URL1 = "http://www.cndzys.com/yundong/changshi/index{#}.html";

    // 三联 listing pages (page numbers take an extra underscore, see below).
    private static String B2_URL2 = "http://www.3lian.com/zl/y14/1402/index{#}.html";

    /**
     * Scheduled entry point: clears the table, then re-crawls everything.
     *
     * @throws Exception if the crawl or the database access fails
     */
    public void execute() throws Exception {
        System.out.println("定时任务开启来-------6666666666----------");
        // Clear the table before each scheduled run.
        exerciseService.delete();

        // A ThreadSafeClientConnManager is required because the client is
        // shared across worker threads; the single-client manager fails with
        // "Invalid use of SingleClientConnManager: connection still allocated".
        @SuppressWarnings("deprecation")
        HttpClient client = new DefaultHttpClient(
                new ThreadSafeClientConnManager());
        spiderData(client, B2_URL1, 20);
        // spiderData(client, B2_URL2, 112);
    }

    /**
     * Crawls an explicit list of URLs, parsing each page on a pooled thread.
     *
     * @param client shared, thread-safe HTTP client
     * @param urls   pages to fetch
     * @return always null (kept for the original signature)
     * @throws ParseException on malformed responses
     * @throws IOException    on network failures
     */
    public List<BaseEntity> spiderData(HttpClient client, String urls[])
            throws ParseException, IOException {
        StopWatch watch = new StopWatch();
        logger.info("爬虫开始....");
        watch.start("spiderData");
        ExecutorService executorService = Executors.newCachedThreadPool();

        WaitTimeMonitoringExecutorService monitorExector = new WaitTimeMonitoringExecutorService(
                executorService);
        // One latch slot per URL. The original hard-coded 18, which hangs
        // (or releases early) whenever urls.length != 18.
        final CountDownLatch countDownLatch = new CountDownLatch(urls.length);
        for (int i = 0; i < urls.length; i++) {
            final HttpResponse response = HttpUtils.getRawHtml(client, urls[i]);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                try {
                    monitorExector.submit(new Runnable() {
                        public void run() {
                            try {
                                System.out.println("线程"
                                        + Thread.currentThread().getId()
                                        + "开始出发");
                                // Parse the page and persist the records.
                                parseandAdd(
                                        EntityUtils.toString(
                                                response.getEntity(), "utf-8"),
                                        "");
                            } catch (Exception e) {
                                e.printStackTrace();
                            } finally {
                                // Count down even when parsing fails so the
                                // await() below can never hang (the original
                                // only counted down on success).
                                countDownLatch.countDown();
                            }
                        }
                    });
                } catch (Exception e1) {
                    e1.printStackTrace();
                    // Submission failed: release this URL's latch slot.
                    countDownLatch.countDown();
                }
            } else {
                // Non-200: drain the entity and release the latch slot.
                EntityUtils.consume(response.getEntity());
                countDownLatch.countDown();
            }
        }

        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        executorService.shutdown();

        watch.stop();
        logger.info("爬虫结束....");
        System.out.println("--------任务执行详情------------");
        System.out.println("秒表计时详情:\n" + watch.prettyPrint() + "\n共爬取" + count
                + "条运动数据");
        return null;
    }

    /**
     * Crawls {@code j} consecutive listing pages generated from a URL
     * template; "{#}" in the template is replaced with the page number, and
     * page 0 is the bare index page.
     *
     * @param client shared, thread-safe HTTP client
     * @param url    URL template containing a "{#}" placeholder
     * @param j      number of pages to fetch
     * @return always null (kept for the original signature)
     * @throws ParseException on malformed responses
     * @throws IOException    on network failures
     */
    public List<BaseEntity> spiderData(HttpClient client, String url, int j)
            throws ParseException, IOException {
        StopWatch watch = new StopWatch();
        logger.info("爬虫开始....");
        watch.start("spiderData");
        ExecutorService executorService = Executors.newFixedThreadPool(30);

        final CountDownLatch countDownLatch = new CountDownLatch(j);
        for (int i = 0; i < j; i++) {
            String newUrl;
            if (i == 0) {
                // First page has no page number. The original compared
                // "i + 1 == i", which is never true, so the bare index page
                // was never fetched.
                newUrl = url.replace("{#}", "");
            } else if (url.contains("http://www.3lian.com/")) {
                // 三联 URLs put an underscore before the page number.
                newUrl = url.replace("{#}", "_" + i);
            } else {
                newUrl = url.replace("{#}", String.valueOf(i));
            }
            final String finalUrl = newUrl;
            final HttpResponse response = HttpUtils.getRawHtml(client, newUrl);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                try {
                    executorService.submit(new Runnable() {
                        public void run() {
                            try {
                                System.out.println("线程:"
                                        + Thread.currentThread().getId() + "开始");
                                // Parse the page and persist the records.
                                parseandAdd(
                                        EntityUtils.toString(
                                                response.getEntity(), "utf-8"),
                                        finalUrl);
                                System.out.println(countDownLatch.getCount());
                            } catch (Exception e) {
                                e.printStackTrace();
                            } finally {
                                // Always release the latch slot, even on
                                // failure, so await() cannot hang.
                                countDownLatch.countDown();
                            }
                        }
                    });
                } catch (Exception e1) {
                    e1.printStackTrace();
                    countDownLatch.countDown();
                }
            } else {
                // Non-200: drain the entity and release the latch slot (the
                // original skipped the countDown here, deadlocking await()).
                EntityUtils.consume(response.getEntity());
                countDownLatch.countDown();
            }
        }

        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        executorService.shutdown();

        watch.stop();
        logger.info("爬虫结束....");
        System.out.println("--------任务执行详情------------");
        System.out.println("秒表计时详情:\n" + watch.prettyPrint() + "\n共爬取" + count
                + "条运动数据");
        return null;
    }

    /**
     * Parses one listing page and saves the extracted entries. The selector
     * logic differs per source site and is chosen by the URL prefix.
     *
     * @param html page source
     * @param url  the page's URL, used to pick the parsing rules
     * @throws Exception on parse errors (persistence errors are logged)
     */
    private void parseandAdd(String html, String url) throws Exception {
        List<BaseEntity> data = new ArrayList<BaseEntity>();
        Document doc = Jsoup.parse(html);
        Elements elements = null;
        if (url.startsWith("http://www.3lian.com/")) {
            // 三联: each article is a <dl> block.
            elements = doc.getElementsByTag("dl");
            for (Element ele : elements) {
                // Thumbnail image.
                Elements img = ele.getElementsByTag("img");
                String imgLink = img.attr("src");
                String width = img.attr("width");

                // Article link, title, and summary text.
                Elements aContent = ele.getElementsByTag("a");
                String linkHref = aContent.attr("href");
                String linkText = aContent.attr("title");
                Elements pcontent = ele.getElementsByTag("p");
                String content = pcontent.text();
                if (StringUtils.isNotBlank(linkText)
                        && StringUtils.isNotBlank(linkHref)) {
                    // Skip user-profile links.
                    if (linkHref.contains("userid"))
                        continue;

                    BaseEntity base = new BaseEntity();
                    base.setId(UuidUtil.get32UUID());
                    base.setLinkHref(linkHref);
                    base.setLinkText(linkText);
                    base.setContent(content);
                    base.setWidth(width);
                    base.setImgLink(imgLink);
                    System.out.println(base.toString());
                    data.add(base);
                    count.incrementAndGet();
                }
            }
            // Persist this page's batch; a failed insert must not kill the
            // worker thread.
            try {
                exerciseService.saveExercise(data);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            // 健康养生网 (cndzys): each article is a div.news block.
            elements = doc.select("div[class='news']");
            for (Element ele : elements) {
                // Thumbnail image; note this site stores the height attribute
                // in the entity's width field (kept as in the original).
                Elements img = ele.getElementsByTag("img");
                String imgLink = img.attr("src");
                String width = img.attr("height");

                // First anchor inside div.news_title carries link and title.
                Element aContent = ele.select("div[class=news_title]").first()
                        .children().first().getElementsByTag("a").first();
                // Relative hrefs are absolutized against the site root.
                String linkHref = BASE_URL1 + aContent.attr("href");
                String linkText = aContent.text();
                Elements pcontent = ele.getElementsByTag("p");
                String content = pcontent.text();
                if (StringUtils.isNotBlank(linkText)
                        && StringUtils.isNotBlank(linkHref)) {
                    BaseEntity base = new BaseEntity();
                    base.setId(UuidUtil.get32UUID());
                    base.setLinkHref(linkHref);
                    base.setLinkText(linkText);
                    base.setContent(content);
                    base.setWidth(width);
                    base.setImgLink(imgLink);
                    System.out.println(base.toString());
                    data.add(base);
                    System.out.println(count.incrementAndGet());
                }
            }

            // Persist this page's batch.
            try {
                exerciseService.saveExercise(data);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

3. 线程池监控

import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StopWatch;


/**
 * ExecutorService decorator that measures how long each submitted task waits
 * in the queue before it starts executing, reporting the time via a Spring
 * StopWatch. All lifecycle and bulk-invocation methods delegate to the
 * wrapped pool; the original left execute()/shutdown() as empty stubs (tasks
 * submitted via execute() were silently dropped and the pool never shut
 * down) and returned null/false from the remaining overrides.
 *
 * @author zhangdong
 *
 *         2018-4-3 下午10:52:44
 */
public class WaitTimeMonitoringExecutorService implements ExecutorService {

    // Log against this class, not ExerciseTask (copy-paste slip in the
    // original).
    private final static Logger logger = LoggerFactory
            .getLogger(WaitTimeMonitoringExecutorService.class);

    /** The pool that actually runs the tasks. */
    private final ExecutorService target;

    public WaitTimeMonitoringExecutorService(ExecutorService target) {
        this.target = target;
    }

    @Override
    public void execute(Runnable command) {
        // Delegate with wait-time monitoring; the original no-op dropped the
        // task entirely.
        target.execute(wrap(command));
    }

    @Override
    public void shutdown() {
        target.shutdown();
    }

    @Override
    public List<Runnable> shutdownNow() {
        return target.shutdownNow();
    }

    @Override
    public boolean isShutdown() {
        return target.isShutdown();
    }

    @Override
    public boolean isTerminated() {
        return target.isTerminated();
    }

    @Override
    public boolean awaitTermination(long timeout, TimeUnit unit)
            throws InterruptedException {
        return target.awaitTermination(timeout, unit);
    }

    @Override
    public <T> Future<T> submit(final Callable<T> task) {
        // The watch starts now (submission) and stops when the task begins
        // executing, so the measured span is the queue wait time.
        final StopWatch watch = new StopWatch();
        watch.start(task.toString());
        // Submit as a Callable so the task's result is preserved; the
        // original submitted a Runnable and cast the Future, discarding the
        // callable's return value and hiding its exceptions.
        return target.submit(new Callable<T>() {
            @Override
            public T call() throws Exception {
                watch.stop();
                logger.debug(watch.prettyPrint());
                return task.call();
            }
        });
    }

    @Override
    public <T> Future<T> submit(Runnable task, T result) {
        return target.submit(wrap(task), result);
    }

    @Override
    public Future<?> submit(final Runnable task) {
        return target.submit(wrap(task));
    }

    /**
     * Wraps a Runnable so the queue wait time is printed when it starts.
     * Exceptions propagate to the returned Future instead of being swallowed.
     */
    private Runnable wrap(final Runnable task) {
        final StopWatch watch = new StopWatch();
        watch.start(task.toString());
        return new Runnable() {
            @Override
            public void run() {
                watch.stop();
                System.out.println(watch.prettyPrint());
                task.run();
            }
        };
    }

    @Override
    public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks)
            throws InterruptedException {
        return target.invokeAll(tasks);
    }

    @Override
    public <T> List<Future<T>> invokeAll(
            Collection<? extends Callable<T>> tasks, long timeout, TimeUnit unit)
            throws InterruptedException {
        return target.invokeAll(tasks, timeout, unit);
    }

    @Override
    public <T> T invokeAny(Collection<? extends Callable<T>> tasks)
            throws InterruptedException, ExecutionException {
        return target.invokeAny(tasks);
    }

    @Override
    public <T> T invokeAny(Collection<? extends Callable<T>> tasks,
            long timeout, TimeUnit unit) throws InterruptedException,
            ExecutionException, TimeoutException {
        return target.invokeAny(tasks, timeout, unit);
    }
}





阅读更多
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: