Jsoup初接触-发一个Jsoup抓取图片的程序
2013-12-05 11:31
288 查看
主要有两个线程:图片url抓取线程、图片下载保存线程。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
url抓取线程:
图片处理线程很简单,就是图片下载和保存:
下载保存:
程序入口如下:
配置文件:
用haha365的动态gif做了下测试,如果想爬取别的网站,根据对方的html结构修改一下爬取规则即可。
程序中没做过多的容错处理,可能存在一定的bug。
源码下载
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
url抓取线程:
package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler task: takes list-page URLs from a shared queue, follows every
 * thumbnail link on the page to its detail page, extracts the real image URL
 * and hands a {@link GifProcessor} to an internal thread pool.
 */
public class GifSpider implements Runnable {

    volatile boolean isRunning = true;
    private ThreadPoolExecutor threadPool;
    BlockingQueue<String> queue;

    /**
     * @param queue queue of list-page URLs this spider consumes
     */
    public GifSpider(BlockingQueue<String> queue) {
        this.queue = queue;
        this.init();
    }

    /**
     * Builds the download thread pool from the {@code threadpool.*} properties.
     */
    private void init() {
        Properties pro = PropertyUtil.getProperties();
        int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
        int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
        int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
        int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
        BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
        this.threadPool = new ThreadPoolExecutor(
                corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS, workQueue);
    }

    public boolean isRunning() {
        return isRunning;
    }

    public void setRunning(boolean isRunning) {
        this.isRunning = isRunning;
    }

    /**
     * Consumes list-page URLs until {@link #setRunning(boolean)} is called with
     * {@code false} or the thread is interrupted. When the loop ends, the
     * download pool is shut down; tasks already submitted still run.
     */
    @Override
    public void run() {
        try {
            while (this.isRunning) {
                try {
                    String url = this.queue.take();
                    System.out.println("请求url:" + url);
                    this.crawlListPage(url);
                    // Pause between list pages.
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Interruption is a stop request: restore the flag and leave the loop
                    // instead of printing the trace and blocking on take() again.
                    Thread.currentThread().interrupt();
                    this.isRunning = false;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            this.threadPool.shutdown();
        }
    }

    /**
     * Fetches one list page and submits every image it links to.
     *
     * @param url absolute URL of the list page
     * @throws IOException if the list page itself cannot be fetched
     */
    private void crawlListPage(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Element list = doc.select("div.pic_list2").first();
        if (list == null) {
            // first() returns null when nothing matches the selector.
            System.out.println("未找到图片列表,url:" + url);
            return;
        }
        for (Element link : list.select("a[href]")) {
            // Each picture has an image link and a text link to the same target;
            // only the image link is followed.
            Elements thumbs = link.select("img");
            if (thumbs.isEmpty()) {
                continue;
            }
            String detailUrl = link.absUrl("href");
            if (detailUrl.length() == 0) {
                // The href could not be resolved to an absolute URL.
                continue;
            }
            try {
                this.submitImage(thumbs.attr("alt"), detailUrl);
            } catch (IOException e) {
                // A broken detail page must not abort the remaining links of this page.
                System.out.println("解析图片页面失败,url:" + detailUrl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Resolves the real image URL on a detail page and queues its download.
     *
     * @param text      alt text of the thumbnail, passed on as the image name
     * @param detailUrl absolute URL of the detail page
     * @throws IOException if the detail page cannot be fetched
     */
    private void submitImage(String text, String detailUrl) throws IOException {
        Document detail = Jsoup.connect(detailUrl).get();
        Element holder = detail.getElementById("endtext");
        if (holder == null) {
            return;
        }
        // The page source uses a relative path; "abs:" resolves it against the page URL.
        String realUrl = holder.select("img").attr("abs:src");
        if (realUrl.length() == 0) {
            return;
        }
        System.out.println("获取图片url:" + realUrl);
        // Hand the image URL to the thread pool.
        this.threadPool.execute(new GifProcessor(text, realUrl));
    }
}
图片处理线程很简单,就是图片下载和保存:
package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Thread-pool task that downloads one image and saves it via
 * {@link FileProcessor}. Failures are reported on standard output and are not
 * rethrown.
 */
public class GifProcessor implements Runnable {

    private String imgName;
    private String imgUrl;

    /**
     * @param name name passed on to {@link FileProcessor}
     * @param url  URL of the image to download
     */
    public GifProcessor(String name, String url) {
        this.imgName = name;
        this.imgUrl = url;
    }

    /**
     * Logs the URL, then delegates the download and save to {@link FileProcessor#saveGif()}.
     */
    @Override
    public void run() {
        final String source = this.imgUrl;
        final FileProcessor saver = new FileProcessor(this.imgName, source);
        try {
            System.out.println("下载保存图片url:" + source);
            saver.saveGif();
        } catch (Exception failure) {
            System.out.println("下载保存图片失败,url:" + source);
            failure.printStackTrace();
        }
    }
}
下载保存:
package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads a single image over HTTP and writes it into the directory named by
 * the {@code dir} property.
 */
public class FileProcessor {

    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;
    /** Extension used when the URL path does not carry one. */
    private static final String DEFAULT_EXTENSION = ".gif";

    private String imgName;
    private String imgUrl;

    /**
     * @param name base name of the file to write (without extension)
     * @param url  URL of the image to download
     */
    public FileProcessor(String name, String url) {
        this.imgName = name;
        this.imgUrl = url;
    }

    /**
     * Returns the configured save directory, creating it (including missing
     * parent directories) when it does not exist.
     *
     * @return the value of the {@code dir} property
     * @throws IllegalStateException if the property is missing or the directory
     *                               cannot be created
     */
    private String makeDir() {
        String strdir = PropertyUtil.getProperties().getProperty("dir");
        if (strdir == null || strdir.trim().length() == 0) {
            throw new IllegalStateException("Property 'dir' is not configured");
        }
        File dir = new File(strdir);
        // mkdirs() also creates missing parents; mkdir() fails for nested paths.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory: " + strdir);
        }
        return strdir;
    }

    /**
     * Downloads the image and saves it as {@code <dir>/<name><extension>}.
     * Nothing is written when the download yields zero bytes.
     *
     * @throws Exception if the download or the file write fails
     */
    public void saveGif() throws Exception {
        String dir = makeDir();
        // File(parent, child) inserts the separator, so 'dir' needs no trailing slash.
        File target = new File(dir, this.safeFileName() + this.extension());
        byte[] data = this.download();
        if (data.length == 0) {
            return;
        }
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
        try {
            out.write(data);
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Returns the URL with any query string and fragment removed.
     */
    private String urlPath() {
        String path = this.imgUrl;
        int cut = path.indexOf('?');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        return path;
    }

    /**
     * Returns the file extension (with leading dot) taken from the last path
     * segment of the URL, or {@link #DEFAULT_EXTENSION} when there is none.
     */
    private String extension() {
        String path = this.urlPath();
        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        // Only a dot inside the last segment counts; a dot in the host name does not.
        if (dot > slash && dot < path.length() - 1) {
            return path.substring(dot);
        }
        return DEFAULT_EXTENSION;
    }

    /**
     * Returns the image name with characters that are illegal in file names
     * replaced by underscores; falls back to {@code "image"} when the name is
     * null or blank.
     */
    private String safeFileName() {
        String name = this.imgName == null ? "" : this.imgName.trim();
        name = name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
        return name.length() == 0 ? "image" : name;
    }

    /**
     * Downloads the image into memory.
     *
     * @return the response body
     * @throws Exception if the connection or the read fails
     */
    private byte[] download() throws Exception {
        URL url = new URL(this.imgUrl);
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server blocks a pool thread indefinitely.
        httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        httpConn.setReadTimeout(READ_TIMEOUT_MS);
        InputStream cin = null;
        try {
            cin = httpConn.getInputStream();
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int len;
            while ((len = cin.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        } finally {
            try {
                if (cin != null) {
                    cin.close();
                }
            } finally {
                httpConn.disconnect();
            }
        }
    }
}
程序入口如下:
package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Program entry point: fills a queue with list-page URLs and starts the
 * configured number of {@link GifSpider} threads.
 */
public class Main {

    public static void main(String[] args) {
        init();
    }

    /**
     * Reads {@code startPage}, {@code endPage}, {@code url} and
     * {@code spiderThread} from the properties, enqueues one URL per page in
     * {@code [startPage, endPage]} and starts the spider threads.
     *
     * @throws IllegalArgumentException if {@code endPage} is smaller than
     *                                  {@code startPage}
     */
    public static void init() {
        Properties pro = PropertyUtil.getProperties();
        int startPage = Integer.parseInt(pro.getProperty("startPage"));
        int endPage = Integer.parseInt(pro.getProperty("endPage"));
        String url = pro.getProperty("url");

        if (endPage < startPage) {
            throw new IllegalArgumentException(
                    "endPage (" + endPage + ") must not be smaller than startPage (" + startPage + ")");
        }

        int count = endPage - startPage + 1;
        BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
        // Iterate over the configured page numbers themselves; counting from 1
        // would ignore startPage.
        for (int page = startPage; page <= endPage; page++) {
            queue.add(String.format(url, page));
        }

        int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
        for (int i = 0; i < spiderCount; i++) {
            GifSpider spider = new GifSpider(queue);
            Thread t = new Thread(spider);
            t.start();
        }
    }
}
配置文件:
spiderThread=1 threadpool.corePoolSize=8 threadpool.maxPoolSize=10 threadpool.keepAliveSeconds=600 threadpool.queueCapacity=1000 startPage=1 endPage=20 url=http://www.haha365.com/gxtp/index_gif_%d.htm dir=E:/spider/
用haha365的动态gif做了下测试,如果想爬取别的网站,根据对方的html结构修改一下爬取规则即可。
程序中没做过多的容错处理,可能存在一定的bug。
源码下载
相关文章推荐
- 判断宽度的js
- js给页面加style
- [置顶] Jsp中的table多表头导出excel文件
- 浏览器图片选择预览、旋转、批量上传的JS代码实现
- 【转发】网易邮箱前端技术分享之javascript编码规范
- JSF简单实例及无法正常访问的解决方法(404和The markup in the document preceding the root element must be well-formed. )
- javascript中比较文本框中数字的大小
- 在Visualforce page中用自带的控件实现Ajax回调后台方法(并且可以用js去动态给parameters赋值)
- JSP向controller传参时,如何把动态记录传过去
- jsp servlet 中 session 以及 cookie 如何正确使用?
- JS禁止浏览器后退键
- silverlight和javascript交互
- 浅谈JavaScript函数参数的可修改性问题
- ExtJs htmleditor工具栏设置
- Extjs 疑难杂症 (LoadMark 遮罩、Panel Update无效、chrome浏览器date控件全屏)
- 浅析JavaScript中的隐式类型转换
- jsp获取当前时间
- JavaScript使用需要注意的细节
- JSTL标签的用法详解
- 开源的javascript实现页面打印功能,兼容所有的浏览器(情况属实)