您的位置:首页 > 编程语言 > Java开发

多线程爬虫Java调用wget下载文件,独立线程读取输出缓冲区

2015-01-28 17:27 609 查看
写了个抓取appstore的爬虫,需要抓取大量的app。本来是用httpclient下载,但是效果不理想,于是改为直接调用wget下载。但是如果不及时读取子进程的标准输出和错误输出,输出缓冲区写满后子进程就会卡住;另外wget本身有时也会莫名卡住。

所以我采用:

一、独立线程读取输出信息;

二、自己实现doWaitFor方法来代替api提供的waitFor()方法,避免子进程卡死。

三、设置超时,超时后杀死wget子进程;如果没有正确返回,则重试一次,并把超时时间加倍(第一次30秒,重试时60秒);

有了以上操作,wget不会卡死,就算卡住了也会因为超时被干掉再重试一次,所以绝大部分的app可以被抓取下来。

import com.google.common.io.Files;
import com.xxx.appstore.service.crawler.CalcMD5Service;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.RandomUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

/**
 * Helpers for downloading crawled app packages to local disk by running
 * {@code wget} as a child process.
 *
 * <p>The child's stdout/stderr are drained on separate threads and the wait is
 * bounded by a timeout, so a stuck {@code wget} cannot block the caller forever.
 */
public class CrawlerUtils {

    public static final String APK_DOWNLOAD_PATH = "/data/appstore/category/";
    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerUtils.class);

    /** Maximum number of wget invocations per download (first try plus one retry). */
    private static final int MAX_ATTEMPTS = 2;
    /** Timeout of the first attempt in seconds; attempt {@code n} waits {@code n} times as long. */
    private static final int BASE_TIMEOUT_SECONDS = 30;

    /**
     * Downloads a file with wget.
     *
     * <p>The target path is {@code APK_DOWNLOAD_PATH/<first 10 chars of encoded category>/
     * <encoded displayName + random number below 1000>.apk}. wget is started up to
     * {@value #MAX_ATTEMPTS} times; the first attempt may run for 30 seconds, the retry
     * for 60 seconds. If every attempt fails, the incomplete output file is removed.
     *
     * @param displayName  app name
     * @param category     category
     * @param download_url download URL
     * @return the path of the downloaded file on success, {@code null} on failure
     *         (blank argument, directory creation failure, or wget not exiting with 0)
     */
    public static String downloadFileByWget(String displayName, String category, String download_url) {
        if (StringUtils.isBlank(displayName) || StringUtils.isBlank(category) || StringUtils.isBlank(download_url)) {
            LOGGER.info("downloadFileByWget ERROR, displayName:{}, category:{}, download_url:{}", new Object[]{displayName, category, download_url});
            return null;
        }
        String fileName = CalcMD5Service.encoder(displayName + RandomUtils.nextInt(1000));
        String seed = CalcMD5Service.encoder(category);
        String midPath = StringUtils.left(seed, 10);
        String filePath = APK_DOWNLOAD_PATH + midPath + "/" + fileName + ".apk";
        File file = new File(filePath);
        try {
            Files.createParentDirs(file);
        } catch (IOException e) {
            LOGGER.warn("IOException", e);
            return null;
        }

        int res = -1;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            ProcessBuilder pb = new ProcessBuilder("wget", download_url, "-t", "2", "-T", "10", "-O", filePath);
            LOGGER.info("wget shell: {}", pb.command());
            Process ps = null;
            try {
                ps = pb.start();
            } catch (IOException e) {
                // doWaitFor treats a null process as a failed attempt.
                LOGGER.error("IOException", e);
            }
            res = doWaitFor(ps, BASE_TIMEOUT_SECONDS * attempt);
            if (res == 0) {
                break;
            }
            LOGGER.warn("Wget download failed...");
            if (Thread.currentThread().isInterrupted()) {
                // The caller asked this thread to stop; do not start another download.
                break;
            }
        }
        if (res != 0) {
            // "wget -O" creates the output file even when the download fails. The caller only
            // gets null and cannot know the random file name, so clean up here.
            if (file.exists() && !file.delete()) {
                LOGGER.warn("Failed to delete incomplete download: {}", filePath);
            }
            return null;
        }
        return filePath;
    }

    /**
     * Waits for a child process to exit, polling once per second, and destroys it
     * when it has not exited within {@code timeout} seconds.
     *
     * <p>Replaces {@link Process#waitFor()}, which has no timeout. The process's
     * stdout and stderr are read by {@link ThreadUtil} threads while waiting so the
     * child cannot block on a full output buffer.
     *
     * @param ps      sub process; may be {@code null} if it failed to start
     * @param timeout timeout in seconds
     * @return the process exit value (0 for a normal finish), or -1 if {@code ps} is
     *         {@code null}, the timeout elapsed, or the waiting thread was interrupted
     */
    private static int doWaitFor(Process ps, int timeout) {
        if (ps == null) {
            return -1;
        }
        // The lists are only sinks required by ThreadUtil; the captured lines are not read here.
        List<String> stdoutList = new ArrayList<>();
        List<String> erroroutList = new ArrayList<>();
        new ThreadUtil(ps.getInputStream(), stdoutList).start();
        new ThreadUtil(ps.getErrorStream(), erroroutList).start();

        for (int elapsed = 0; elapsed < timeout; elapsed++) {
            try {
                return ps.exitValue();
            } catch (IllegalThreadStateException stillRunning) {
                // exitValue() throws while the process is alive; wait and poll again.
            }
            try {
                TimeUnit.SECONDS.sleep(1);
            } catch (InterruptedException e) {
                // Restore the interrupt flag for the caller and stop the child.
                Thread.currentThread().interrupt();
                LOGGER.warn("Interrupted while waiting for wget, destroyed!");
                ps.destroy();
                return -1;
            }
        }
        // One last check so a process that finished during the final sleep still counts.
        try {
            return ps.exitValue();
        } catch (IllegalThreadStateException stillRunning) {
            LOGGER.info("Process wget timeout {}s, destroyed!", timeout);
            ps.destroy();
            return -1;
        }
    }
}


import org.apache.commons.io.Charsets;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.List;

/**
 * Reads an {@link InputStream} line by line as UTF-8 text and appends every line
 * to a caller-supplied list.
 *
 * <p>{@link #start()} runs the reading on a new daemon thread; {@link #run()} can
 * also be called directly to read on the current thread. Lines are added to the
 * list without any synchronization.
 */
public class ThreadUtil implements Runnable {
    private final List<String> list;
    private final InputStream inputStream;

    /**
     * @param inputStream stream to read; it is closed when reading ends
     * @param list        receives one element per line read
     */
    public ThreadUtil(InputStream inputStream, List<String> list) {
        this.inputStream = inputStream;
        this.list = list;
    }

    /** Starts reading the stream on a new daemon thread and returns immediately. */
    public void start() {
        Thread thread = new Thread(this);
        // Daemon, so a reader blocked on the stream never keeps the JVM alive.
        thread.setDaemon(true);
        thread.start();
    }

    /**
     * Reads the stream until end of input, storing each line in the list, then
     * closes the stream. An {@link IOException} ends the reading early.
     */
    @Override
    public void run() {
        // Closing the reader also closes the wrapped input stream.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
            }
        } catch (IOException e) {
            // NOTE(review): no logger is in scope in this file; consider routing this to the project logger.
            e.printStackTrace();
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: