您的位置:首页 > Web前端 > JavaScript

Jsoup初接触-发一个Jsoup抓取图片的程序

2013-12-05 11:31 288 查看
主要有两个线程:图片url抓取线程、图片下载保存线程。

图片下载保存采用线程池处理,主要利用 Java 的 ThreadPoolExecutor 实现。

url抓取线程:

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler task: takes listing-page URLs from a shared queue, follows every
 * thumbnail link on the page to its detail page, extracts the real image URL
 * there and submits a {@link GifProcessor} to an internal thread pool.
 *
 * <p>Each instance builds its own thread pool from the {@code threadpool.*}
 * properties. The pool is shut down (already-queued downloads still finish)
 * when {@link #run()} returns, which happens after {@link #setRunning(boolean)}
 * is called with {@code false} or the crawler thread is interrupted.
 */
public class GifSpider implements Runnable
{

    /** Seconds to wait on the page queue before re-checking the running flag. */
    private static final long POLL_TIMEOUT_SECONDS = 1L;

    /** Pause between two listing pages, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 1000L;

    volatile boolean isRunning = true;
    private ThreadPoolExecutor threadPool;
    BlockingQueue<String> queue;

    /**
     * @param queue queue of listing-page URLs to crawl
     */
    public GifSpider(BlockingQueue<String> queue)
    {
        this.queue = queue;
        this.init();
    }

    /**
     * Builds the download thread pool from the {@code threadpool.*} properties.
     */
    private void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
        int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
        int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
        int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
        BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
        // The work queue is bounded. With the default AbortPolicy a full queue
        // throws RejectedExecutionException and kills the crawler thread;
        // CallerRunsPolicy makes the crawler do the download itself instead,
        // which also throttles crawling while the pool is saturated.
        this.threadPool = new ThreadPoolExecutor(
                corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
                workQueue, new ThreadPoolExecutor.CallerRunsPolicy());
    }

    public boolean isRunning()
    {
        return isRunning;
    }

    public void setRunning(boolean isRunning)
    {
        this.isRunning = isRunning;
    }

    /**
     * Processes listing pages until the running flag is cleared or the thread
     * is interrupted, then shuts the download pool down.
     */
    @Override
    public void run()
    {
        try
        {
            while (this.isRunning)
            {
                // Timed poll instead of take(): an idle spider must still
                // notice setRunning(false).
                String url = this.queue.poll(POLL_TIMEOUT_SECONDS, TimeUnit.SECONDS);
                if (url == null)
                {
                    continue;
                }
                this.crawlListPage(url);
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
        } catch (InterruptedException e)
        {
            // Restore the interrupt status and stop; swallowing it would make
            // the thread impossible to cancel.
            Thread.currentThread().interrupt();
        } finally
        {
            // Lets queued downloads finish but releases the pool threads
            // afterwards so they do not keep the JVM alive.
            this.threadPool.shutdown();
        }
    }

    /**
     * Fetches one listing page and schedules a download for every thumbnail
     * link found in its {@code div.pic_list2} container.
     *
     * @param url absolute URL of the listing page
     */
    private void crawlListPage(String url)
    {
        System.out.println("请求url:" + url);
        Document doc;
        try
        {
            doc = Jsoup.connect(url).get();
        } catch (IOException e)
        {
            e.printStackTrace();
            return;
        }

        Element list = doc.select("div.pic_list2").first();
        if (list == null)
        {
            // Page layout differs from what the selector expects.
            System.out.println("未找到图片列表,url:" + url);
            return;
        }

        for (Element link : list.select("a[href]"))
        {
            // Every picture has an image link and a text link pointing at the
            // same detail page; following only the image links avoids duplicates.
            Elements thumbs = link.select("img");
            if (thumbs.isEmpty())
            {
                continue;
            }
            String detailUrl = link.absUrl("href");
            if (detailUrl.isEmpty())
            {
                continue;
            }
            String text = thumbs.attr("alt");
            try
            {
                String realUrl = this.resolveImageUrl(detailUrl);
                if (realUrl == null)
                {
                    System.out.println("未找到图片,url:" + detailUrl);
                    continue;
                }
                System.out.println("获取图片url:" + realUrl);
                // Hand the image URL over to the thread pool.
                this.threadPool.execute(new GifProcessor(text, realUrl));
            } catch (IOException e)
            {
                // One broken detail page must not abort the rest of the listing.
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads a detail page and returns the absolute URL of the image inside the
     * element with id {@code endtext}.
     *
     * @param detailUrl absolute URL of the detail page
     * @return the absolute image URL, or {@code null} when the page has no such image
     * @throws IOException if the page cannot be fetched
     */
    private String resolveImageUrl(String detailUrl) throws IOException
    {
        Document detail = Jsoup.connect(detailUrl).get();
        Element container = detail.getElementById("endtext");
        if (container == null)
        {
            return null;
        }
        // The page source uses relative paths; "abs:" resolves them against the page URL.
        String realUrl = container.select("img").attr("abs:src");
        return realUrl.isEmpty() ? null : realUrl;
    }

}


图片处理线程很简单,就是图片下载和保存:
package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Runnable that downloads a single image and saves it through
 * {@link FileProcessor}. Any failure is reported on standard output and the
 * stack trace is printed; nothing is rethrown.
 */
public class GifProcessor implements Runnable
{

    private String imgName;
    private String imgUrl;

    /**
     * @param name name the image is saved under
     * @param url  URL the image is downloaded from
     */
    public GifProcessor(String name, String url)
    {
        imgName = name;
        imgUrl = url;
    }

    /**
     * Announces the download, then delegates to {@link FileProcessor#saveGif()}.
     */
    @Override
    public void run()
    {
        final FileProcessor saver = new FileProcessor(imgName, imgUrl);
        try
        {
            System.out.println("下载保存图片url:" + imgUrl);
            saver.saveGif();
        }
        catch (Exception failure)
        {
            reportFailure(failure);
        }
    }

    /**
     * Prints the failure message for this image followed by the stack trace.
     */
    private void reportFailure(Exception failure)
    {
        System.out.println("下载保存图片失败,url:" + imgUrl);
        failure.printStackTrace();
    }

}


下载保存:

package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads one image over HTTP and writes it into the directory configured
 * by the {@code dir} property.
 */
public class FileProcessor
{
    /** Timeout for establishing the HTTP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Timeout for a single blocking read from the HTTP stream, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Extension used when the URL does not carry one. */
    private static final String DEFAULT_EXTENSION = ".gif";

    /** Characters that are not allowed in file names on common file systems. */
    private static final String ILLEGAL_NAME_CHARS = "\\/:*?\"<>|";

    private String imgName;
    private String imgUrl;

    /**
     * @param name base name for the saved file (may contain arbitrary text)
     * @param url  URL the image is downloaded from
     */
    public FileProcessor(String name, String url)
    {
        this.imgName = name;
        this.imgUrl = url;
    }

    /**
     * Returns the configured save directory, creating it (including missing
     * parent directories) when it does not exist.
     *
     * @return the value of the {@code dir} property
     */
    private String makeDir()
    {
        String strdir = PropertyUtil.getProperties().getProperty("dir");
        File dir = new File(strdir);
        if (!dir.exists())
        {
            // mkdirs() rather than mkdir(): the configured path may be nested.
            // A failure here surfaces as an exception when the file is opened.
            dir.mkdirs();
        }
        return strdir;
    }

    /**
     * Downloads the image and writes it to the save directory. Nothing is
     * written when the download yields no bytes.
     *
     * @throws Exception if the download or the write fails
     */
    public void saveGif() throws Exception
    {
        String dir = makeDir();
        // File(parent, child) inserts the separator itself, so the configured
        // directory no longer needs a trailing slash.
        File target = new File(dir, this.buildFileName());
        byte[] data = this.download();
        if (data.length > 0)
        {
            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
            try
            {
                out.write(data);
                out.flush();
            } finally
            {
                out.close();
            }
        }
    }

    /**
     * Builds a safe file name from the image name and the extension found in
     * the URL path. Query string and fragment are ignored when looking for the
     * extension; when the image name is empty the file name from the URL is
     * used instead.
     *
     * @return file name including extension, never empty
     */
    private String buildFileName()
    {
        String path = this.imgUrl;
        int cut = path.indexOf('?');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);

        String urlBaseName = lastSegment;
        String extension = DEFAULT_EXTENSION;
        int dot = lastSegment.lastIndexOf('.');
        if (dot >= 0)
        {
            urlBaseName = lastSegment.substring(0, dot);
            if (dot < lastSegment.length() - 1)
            {
                extension = lastSegment.substring(dot);
            }
        }

        String baseName = sanitize(this.imgName);
        if (baseName.isEmpty())
        {
            baseName = sanitize(urlBaseName);
        }
        if (baseName.isEmpty())
        {
            baseName = "image";
        }
        return baseName + sanitize(extension);
    }

    /**
     * Replaces characters that are illegal in file names (and control
     * characters) with an underscore and trims surrounding whitespace.
     *
     * @param raw text to clean, may be {@code null}
     * @return cleaned text, empty for {@code null} input
     */
    private static String sanitize(String raw)
    {
        if (raw == null)
        {
            return "";
        }
        StringBuilder cleaned = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); i++)
        {
            char c = raw.charAt(i);
            boolean illegal = c < 0x20 || ILLEGAL_NAME_CHARS.indexOf(c) >= 0;
            cleaned.append(illegal ? '_' : c);
        }
        return cleaned.toString().trim();
    }

    /**
     * Downloads the image into memory.
     *
     * @return the raw bytes of the response body
     * @throws Exception if the URL is malformed or the transfer fails
     */
    private byte[] download() throws Exception
    {
        URL url = new URL(this.imgUrl);
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server would block a pool thread forever.
        httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        httpConn.setReadTimeout(READ_TIMEOUT_MS);
        InputStream cin = null;
        try
        {
            httpConn.connect();
            cin = httpConn.getInputStream();
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = cin.read(buffer)) != -1)
            {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        } finally
        {
            // Release the stream and the connection even when the transfer fails.
            if (cin != null)
            {
                cin.close();
            }
            httpConn.disconnect();
        }
    }
}


程序入口如下:

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Program entry point: fills the page queue from the configured page range
 * and starts the crawler threads.
 */
public class Main
{
    public static void main(String[] args)
    {
        init();
    }

    /**
     * Reads {@code startPage}, {@code endPage}, {@code url} and
     * {@code spiderThread} from the properties, queues one listing-page URL
     * per page in {@code [startPage, endPage]} and starts the requested number
     * of {@link GifSpider} threads on that queue.
     *
     * @throws IllegalArgumentException if {@code endPage} is smaller than {@code startPage}
     */
    public static void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int startPage = Integer.parseInt(pro.getProperty("startPage"));
        int endPage = Integer.parseInt(pro.getProperty("endPage"));
        String url = pro.getProperty("url");
        if (endPage < startPage)
        {
            throw new IllegalArgumentException(
                    "endPage (" + endPage + ") must not be smaller than startPage (" + startPage + ")");
        }
        int count = endPage - startPage + 1;
        BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
        // Iterate over the real page numbers; counting from 1 would ignore
        // startPage and crawl the wrong pages whenever startPage != 1.
        for (int page = startPage; page <= endPage; page++)
        {
            queue.add(String.format(url, page));
        }
        int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
        for (int i = 0; i < spiderCount; i++)
        {
            GifSpider spider = new GifSpider(queue);
            Thread t = new Thread(spider);
            t.start();
        }
    }

}


配置文件:

# Number of GifSpider crawler threads started by Main.init().
spiderThread=1

# Settings passed to the ThreadPoolExecutor that GifSpider.init() builds for downloads.
threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000

# Page range read by Main.init() to size and fill the listing-page queue.
startPage=1
endPage=20
# Listing-page URL template; Main.init() fills in %d with String.format.
url=http://www.haha365.com/gxtp/index_gif_%d.htm

# Directory the images are saved to; FileProcessor.makeDir() creates it when missing.
dir=E:/spider/


用haha365的动态gif做了下测试,如果想爬别的网站,自己根据人家的html结构,改一下爬取规则即可。

程序中没做过多的容错处理,可能存在一定的bug。

源码下载
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: