使用阻塞队列爬取代理ip实现爬虫
2017-06-11 13:34
537 查看
package util.common; import org.apache.commons.io.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import javax.net.ssl.HttpsURLConnection; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.*; import java.util.HashMap; import java.util.Map; import java.util.PriorityQueue; import java.util.Queue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingDeque; /** * Created by hms on 2017/6/11. */ public class GetIpAddressUtil { public static void getIpAddress(BlockingQueue<Ip> queue) { Map<String, String> maps = new HashMap<String, String>(); for(int i = 1 ; i < 20; ++i) { try { Document doc = Jsoup.connect("http://www.xicidaili.com/nn/" + i) .data("query", "Java") .userAgent("Netscape/5") .cookie("auth", "token") .timeout(3000) .get(); String regex = "((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))"; Elements elements = doc.select("td:matches(" + regex + ")"); for(int j = 0; j < elements.size(); ++j) { Element e = (Element) elements.get(j); Element e1 = e.nextElementSibling(); String ip = e.text(); String prot = e1.text(); if(isPing(ip)) { //System.out.println(ip + " " + prot); try { queue.put(new Ip(ip, prot)); } catch (InterruptedException e2) { e2.printStackTrace(); } } } } catch (IOException e) { e.printStackTrace(); } } } public static boolean isPing(String ip) { boolean status = false; if(ip != null) { try { status = InetAddress.getByName(ip).isReachable(3000); } catch(UnknownHostException e) { } catch(IOException e) { } } return status; } public static void main(String[] args) { final BlockingQueue<Ip> queue = new LinkedBlockingDeque<>(); Thread thread = new Thread(new Runnable() { @Override public void run() { getIpAddress(queue); } }); Thread thread1 = new Thread(new Runnable() { @Override public void run() 
{ parse(queue); } }); thread.start(); thread1.start(); } public static void parse(BlockingQueue<Ip> queue) { while (true) { Ip ip = null; try { ip = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } while (true) { System.out.println(ip.ip + " " + ip.port); SocketAddress addr = new InetSocketAddress(ip.ip, Integer.parseInt(ip.port)); Proxy proxy = new Proxy(Proxy.Type.HTTP, addr); try{ URL url = new URL("https://api.douban.com/v2/book/isbn/7505715666"); HttpsURLConnection conn = (HttpsURLConnection)url.openConnection(proxy); conn.setConnectTimeout(5000); conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)"); conn.connect(); String result = ""; BufferedReader in = null; in = new BufferedReader(new InputStreamReader( conn.getInputStream(),"UTF-8")); String line; while ((line = in.readLine()) != null) { result += line; } System.out.println(result); }catch (Exception e) { // e.printStackTrace(); break; } } } } } class Ip{ String ip; String port; public Ip(String ip, String port) { this.ip = ip; this.port = port; } }
相关文章推荐
- python3实现网络爬虫(7)-- 使用ip代理抓取网页
- 使用lock和condition实现的阻塞队列-字符串
- Java并发编程笔记 使用阻塞队列实现生产者-消费者模型
- Linux C++ 使用condition实现阻塞队列
- Python3 爬虫使用User Agent和代理IP隐藏身份
- 爬虫使用代理ip
- 使用Semaphore实现阻塞队列
- Python 爬虫入门(二)—— IP代理使用
- 使用阻塞队列BlockingQueue实现生产者消费者
- java 使用ReentrantLock Condition实现阻塞队列
- Java使用阻塞队列实现指定文件夹下面搜索指定关键字
- Linux C++ 使用condition实现阻塞队列
- java并发编程学习: 阻塞队列 使用 及 实现原理
- java使用阻塞队列实现生产者消费者模式
- java中使用阻塞队列实现生产者与消费者之间的关系
- 使用阻塞队列实现生产者和消费者问题
- Java使用阻塞队列BlockingQueue实现线程同步
- 使用TaskManager爬取2万条代理IP实现自动投票功能
- 如何使用ip代理爬虫
- Jsoup使用代理ip爬虫