您的位置:首页 > 其它

使用阻塞队列爬取代理ip实现爬虫

2017-06-11 13:34 537 查看
package util.common;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.net.ssl.HttpsURLConnection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.*;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;

/**
* Created by hms on 2017/6/11.
*/
/**
 * Crawls free proxy IPs from xicidaili.com, verifies each with a reachability
 * ping, and feeds working ones into a {@link BlockingQueue} that a consumer
 * thread drains to proxy HTTPS requests against the Douban book API.
 *
 * Created by hms on 2017/6/11.
 */
public class GetIpAddressUtil {

    /**
     * Producer: scrapes pages 1..19 of the proxy list, extracts IP/port pairs
     * from adjacent table cells, and puts reachable ones on {@code queue}.
     *
     * @param queue destination for verified proxies; {@code put} blocks when full
     */
    public static void getIpAddress(BlockingQueue<Ip> queue) {
        // Dotted-quad IPv4 matcher (each octet 0-255); loop-invariant, so built once.
        String regex =
                "((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))";
        for (int i = 1; i < 20; ++i) {
            try {
                Document doc = Jsoup.connect("http://www.xicidaili.com/nn/" + i)
                        .data("query", "Java")
                        .userAgent("Netscape/5")
                        .cookie("auth", "token")
                        .timeout(3000)
                        .get();
                // <td> cells whose text contains an IPv4 address; the port sits
                // in the immediately following sibling cell.
                Elements elements = doc.select("td:matches(" + regex + ")");
                for (Element cell : elements) {
                    Element portCell = cell.nextElementSibling();
                    if (portCell == null) {
                        continue; // malformed row: no port column next to the IP
                    }
                    String ip = cell.text();
                    String port = portCell.text();
                    if (isPing(ip)) {
                        try {
                            queue.put(new Ip(ip, port));
                        } catch (InterruptedException ie) {
                            // Restore the interrupt flag and stop producing.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            } catch (IOException e) {
                // This page failed to load; log and move on to the next page.
                e.printStackTrace();
            }
        }
    }

    /**
     * Best-effort reachability check.
     *
     * @param ip dotted-quad address; may be null
     * @return true if the host answered within 3 seconds, false otherwise
     *         (including resolution failures and null input)
     */
    public static boolean isPing(String ip) {
        if (ip == null) {
            return false;
        }
        try {
            return InetAddress.getByName(ip).isReachable(3000);
        } catch (IOException ignored) {
            // Unknown host or I/O failure simply means "not usable as a proxy".
            return false;
        }
    }

    /** Wires the producer (crawler) and consumer (request loop) threads together. */
    public static void main(String[] args) {
        final BlockingQueue<Ip> queue = new LinkedBlockingDeque<>();
        Thread producer = new Thread(new Runnable() {
            @Override
            public void run() {
                getIpAddress(queue);
            }
        });
        Thread consumer = new Thread(new Runnable() {
            @Override
            public void run() {
                parse(queue);
            }
        });
        producer.start();
        consumer.start();
    }

    /**
     * Consumer: takes proxies off the queue and reuses each one for repeated
     * requests until the proxy fails, then moves on to the next proxy.
     *
     * @param queue source of verified proxies; {@code take} blocks when empty
     */
    public static void parse(BlockingQueue<Ip> queue) {
        while (true) {
            Ip ip;
            try {
                ip = queue.take();
            } catch (InterruptedException e) {
                // Original code fell through with a null ip and NPE'd on the
                // next line; exit the consumer cleanly instead.
                Thread.currentThread().interrupt();
                return;
            }
            while (true) {
                System.out.println(ip.ip + " " + ip.port);
                SocketAddress addr = new InetSocketAddress(ip.ip, Integer.parseInt(ip.port));
                Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
                HttpsURLConnection conn = null;
                try {
                    URL url = new URL("https://api.douban.com/v2/book/isbn/7505715666");
                    conn = (HttpsURLConnection) url.openConnection(proxy);
                    conn.setConnectTimeout(5000);
                    conn.setRequestProperty("User-Agent",
                            "Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)");
                    conn.connect();
                    StringBuilder result = new StringBuilder();
                    // try-with-resources closes the reader; the original leaked it.
                    try (BufferedReader in = new BufferedReader(
                            new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                        String line;
                        while ((line = in.readLine()) != null) {
                            result.append(line);
                        }
                    }
                    System.out.println(result);
                } catch (Exception e) {
                    // Proxy failed (connect/parse error); drop it, fetch the next one.
                    break;
                } finally {
                    if (conn != null) {
                        conn.disconnect(); // release the underlying connection
                    }
                }
            }
        }
    }
}
/**
 * Immutable value holder for a proxy endpoint: an IPv4 address and a port,
 * both kept as strings exactly as scraped from the page.
 */
class Ip {
    final String ip;   // dotted-quad address, e.g. "1.2.3.4"
    final String port; // numeric port as text; parsed with Integer.parseInt by the consumer

    public Ip(String ip, String port) {
        this.ip = ip;
        this.port = port;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  爬虫