您的位置:首页 > 编程语言

使用gecco获取代理IP,仅测试代码用,不要用于非法用途

2016-03-29 16:25 459 查看
这里用到了三个类

首先是gecco开始的地方,抓取ip列表

package com.geccocrawler.gecco.demo.ipcatch;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * Gecco bean for one page of the kuaidaili.com free proxy list
 * ({@code /free/inha/{currPage}/}).
 *
 * <p>Besides holding the extracted data, {@link #setIps(List)} appends every
 * extracted {@code ip:port} pair to {@link #OUTPUT_PATH}. The class also
 * contains the {@link #main(String[])} entry point that starts the crawl.
 */
@Gecco(matchUrl="http://www.kuaidaili.com/free/inha/{currPage}/", pipelines={"consolePipeline", "IpListPipeline"})
public class IpList implements HtmlBean {

    private static final long serialVersionUID = 4544492736813943899L;

    /** File that the collected {@code ip:port} lines are appended to. */
    private static final String OUTPUT_PATH = "E:/ip/ip.txt";

    /** Line terminator written after each entry (CR LF). */
    private static final String LINE_END = "\r\n";

    @Request
    private HttpRequest request;

    @RequestParameter("currPage")
    private String currPage;

    /**
     * Details of the list entries (IP, port, ...) selected by {@code #list}.
     */
    @HtmlField(cssPath="#list")
    private List<IpDetail> ips;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public String getCurrPage() {
        return currPage;
    }

    public void setCurrPage(String currPage) {
        this.currPage = currPage;
    }

    public List<IpDetail> getIps() {
        return ips;
    }

    /**
     * Stores the extracted details and appends them to {@link #OUTPUT_PATH}.
     *
     * <p>Only the first element of {@code ips} is read: its ip and port
     * strings are each split on {@code '\n'} and paired up by index. When the
     * two strings contain a different number of lines, only the pairs that
     * exist in both are written. A {@code null} or empty list, or a first
     * element without ip/port text, stores the value and writes nothing.
     *
     * @param ips details extracted from the page; may be {@code null} or empty
     * @throws IOException if the output directory cannot be created or the
     *         file cannot be opened or written
     */
    public void setIps(List<IpDetail> ips) throws IOException {
        // Store first so the bean reflects the extracted data even if the file write fails.
        this.ips = ips;
        if (ips == null || ips.isEmpty()) {
            return;
        }
        IpDetail first = ips.get(0);
        if (first == null || first.getIp() == null || first.getPort() == null) {
            return;
        }

        String[] ipGroup = first.getIp().split("\n");
        String[] portGroup = first.getPort().split("\n");
        // Guard against a mismatch between the two columns instead of running off the shorter array.
        int count = Math.min(ipGroup.length, portGroup.length);

        File file = new File(OUTPUT_PATH);
        File parent = file.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create output directory: " + parent);
        }

        // Append mode; FileWriter creates the file when it does not exist yet.
        FileWriter writer = new FileWriter(file, true);
        try {
            for (int i = 0; i < count; i++) {
                System.out.println(ipGroup[i]);
                writer.write(ipGroup[i] + ":" + portGroup[i]);
                writer.write(LINE_END);
            }
        } finally {
            // Always release the file handle, also when a write fails.
            writer.close();
        }
    }

    /**
     * Starts the crawl at page 1 of the proxy list with a single spider thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GeccoEngine.create()
            .classpath("com.geccocrawler.gecco.demo.ipcatch")
            // URL of the first page to crawl
            .start("http://www.kuaidaili.com/free/inha/1/")
            // number of spider threads; preferably not more than the number of start requests
            .thread(1)
            // pause after each request handled by a spider
            .interval(5000)
            .run();
    }
}


难点在于:当td标签没有被table标签包裹时,jsoup无法把它们解析成一个个独立的元素,只能得到一整段字符串,所以上面的代码需要按换行符把这个字符串拆分成各个IP和端口

下面是保存详细信息(IP和端口)的类

package com.geccocrawler.gecco.demo.ipcatch;

import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * Bean for the detail part of the proxy list: the text selected by the first
 * and second {@code td} cells, exposed as {@code ip} and {@code port}.
 */
public class IpDetail implements HtmlBean {

    private static final long serialVersionUID = 2555530396237160927L;

    /** Text selected by {@code td:nth-child(1)}. */
    @HtmlField(cssPath="td:nth-child(1)")
    private String ip;

    /** Text selected by {@code td:nth-child(2)}. */
    @HtmlField(cssPath="td:nth-child(2)")
    private String port;

    // ---- ip ----

    /** @return the extracted ip text, or {@code null} if none was set */
    public String getIp() {
        return this.ip;
    }

    /** @param ip the ip text to store */
    public void setIp(String ip) {
        this.ip = ip;
    }

    // ---- port ----

    /** @return the extracted port text, or {@code null} if none was set */
    public String getPort() {
        return this.port;
    }

    /** @param port the port text to store */
    public void setPort(String port) {
        this.port = port;
    }

    /**
     * Placeholder with an empty body; it does nothing with its arguments.
     *
     * @param ip unused
     * @param port unused
     */
    public void text(String ip, String port) {
        // no-op
    }

}


然后是分页抓取

package com.geccocrawler.gecco.demo.ipcatch;

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;

/**
 * Pipeline that continues the crawl: after a list page has been processed it
 * schedules the request for the following page, up to {@link #MAX_PAGE}.
 */
@PipelineName("IpListPipeline")
public class IpListPipeline implements Pipeline<IpList> {

    /** Highest page number that is still scheduled. */
    private static final int MAX_PAGE = 100;

    /** URL prefix of the "inha" list pages; the page number and a slash are appended. */
    private static final String INHA_PAGE_PREFIX = "http://www.kuaidaili.com/free/inha/";

    /**
     * Schedules the page that follows {@code ipList}'s current page.
     *
     * <p>Nothing is scheduled when the current page is already
     * {@link #MAX_PAGE} or higher, or when the page parameter is missing or
     * not a number.
     *
     * @param ipList the bean extracted from the page that was just crawled
     */
    @Override
    public void process(IpList ipList) {
        String currPage = ipList.getCurrPage();
        int page;
        try {
            page = Integer.parseInt(currPage);
        } catch (NumberFormatException e) {
            // The URL segment matched by {currPage} is not guaranteed to be numeric;
            // stop paging instead of letting the exception abort the pipeline.
            System.err.println("IpListPipeline: cannot parse page number '" + currPage + "', paging stopped");
            return;
        }
        // Compare before incrementing so a huge page number cannot overflow.
        if (page >= MAX_PAGE) {
            return;
        }
        int nextPage = page + 1;

        HttpRequest currRequest = ipList.getRequest();
        String currUrl = currRequest.getUrl();
        String nextUrl;
        if (currUrl.contains("inha")) {
            nextUrl = INHA_PAGE_PREFIX + nextPage + "/";
        } else {
            nextUrl = currUrl + "/" + nextPage + "/";
        }
        // Continue the crawl with the next page.
        SchedulerContext.into(currRequest.subRequest(nextUrl));
    }

}


当然,这是基于gecco的,首先要引入gecco,地址:https://github.com/xtuhcy/gecco
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: