您的位置：首页 > 编程语言 > Java开发

Java爬虫的一些总结和心得

2014-04-07 11:41 519 查看

最近做了很多关于爬虫的项目，写点感想，方便以后查询

1.请求http连接，并保存内容，catch不同的exception进行反爬处理

// Retry counter used by getOneHtml: that method stops retrying and resets this to 0
// once it reaches 5, and also resets it to 0 after a successful fetch.
int countUrl=0;

/**
 * Downloads {@code htmlurl} with an HTTP GET and returns the response body as one string.
 *
 * <p>Line breaks are dropped, tab characters are removed, and the literal escape
 * sequences {@code \u002d} and {@code \u0026} are replaced by {@code -} and {@code &}.
 *
 * <p>Failure handling (the "anti-crawl" logic):
 * <ul>
 *   <li>status other than 200: wait 30 s, fetch a fresh cookie via {@code login()}, retry;</li>
 *   <li>{@link FileNotFoundException}: return {@code "0"} immediately, no retry;</li>
 *   <li>any other {@link IOException}: wait 20 s, retry with the same cookie.</li>
 * </ul>
 * Every retry increments {@code countUrl}; once it reaches 5 the counter is reset and
 * {@code "0"} is returned.
 *
 * @param htmlurl  address to fetch
 * @param encoding charset name used to decode the response body
 * @param cookie   value sent in the {@code cookie} request header
 * @return the cleaned body, or {@code "0"} when the fetch was abandoned
 * @throws IOException          only as {@link MalformedURLException}, when {@code htmlurl}
 *                              is not a valid URL; other I/O errors are retried internally
 * @throws InterruptedException if the thread is interrupted while waiting between retries
 */
public String getOneHtml(String htmlurl,String encoding,String cookie) throws IOException, InterruptedException
{
    // A loop instead of self-recursion: every attempt releases its connection and reader
    // in the finally block before the pause, and the stack does not grow with retries.
    while(true){
        // Retry budget of 5. ">=" rather than "==" so the limit still triggers if the
        // counter has been pushed past 5 by code outside this method.
        if(countUrl>=5){
            countUrl=0;
            return "0";
        }

        boolean relogin=false;
        HttpURLConnection httpConn = null;
        BufferedReader in = null;
        try
        {
            httpConn = (HttpURLConnection) new URL(htmlurl).openConnection();

            // Request headers for a browser-like GET.
            HttpURLConnection.setFollowRedirects(true);
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
            httpConn.setRequestProperty("Connection","keep-alive");
            httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
            httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            httpConn.setRequestProperty("cookie",cookie);
            httpConn.setRequestProperty("Cache-control","no-cache, no-store");
            httpConn.setRequestProperty("Host","www.linkedin.com");
            httpConn.setConnectTimeout(20000);
            httpConn.setReadTimeout(20000);

            // The stream is opened BEFORE the status check on purpose: error statuses then
            // surface as IOException / FileNotFoundException and take the catch branches
            // below, exactly as they did before this rewrite.
            in = new BufferedReader(new InputStreamReader(httpConn.getInputStream(), encoding));
            if(httpConn.getResponseCode()==200){
                final StringBuilder sb = new StringBuilder();
                String temp;
                while ((temp = in.readLine()) != null)
                {
                    // readLine() already strips line terminators, so only tabs and the two
                    // literal escape sequences need handling. (The former regex "\\u002d" /
                    // "\\u0026" calls matched '-' and '&' themselves and changed nothing.)
                    sb.append(temp.replace("\t","").replace("\\u002d","-").replace("\\u0026","&"));
                }
                countUrl=0;
                return sb.toString();
            }
            // Not 200 (e.g. a redirect that was not followed): re-login and try again.
            // Count the attempt so that a permanently rejecting server cannot loop forever.
            relogin=true;
            countUrl++;
        }
        catch (final MalformedURLException me)
        {
            System.out.println("url不存在!");
            throw me;
        }
        catch (final FileNotFoundException me)
        {
            System.out.println(htmlurl+"反爬启动");
            return "0";
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            System.out.println("反爬启动:"+htmlurl+"次数:"+countUrl++);
        }
        finally
        {
            if(in!=null){
                try{
                    in.close();
                }
                catch(final IOException ignored){
                    // Best-effort cleanup; the connection is dropped right below anyway.
                }
            }
            // Null check: openConnection() itself may have thrown.
            if(httpConn!=null){
                httpConn.disconnect();
            }
        }

        // Back off before the next attempt: 30 s after a rejected status, 20 s after an I/O error.
        Thread.sleep(relogin?30000:20000);
        if(relogin){
            cookie=login();
        }
    }
}

2.模拟登录，获取cookie：

/**
 * Posts the login form to LinkedIn and returns the {@code li_at} session cookie.
 *
 * <p>The response's {@code Set-Cookie} headers are scanned and the first one whose name
 * (the text before the first {@code =}) is {@code li_at} is returned unchanged.
 *
 * <p>An {@link IOException} other than the two handled explicitly is retried after a
 * 20 s pause, for at most 5 attempts in total.
 *
 * @return the full {@code li_at} Set-Cookie value; {@code ""} if the response carried no
 *         such cookie; {@code "0"} on {@link FileNotFoundException} or when all attempts failed
 * @throws MalformedURLException if the login URL cannot be parsed
 * @throws InterruptedException  if the thread is interrupted while waiting between attempts
 */
public String login() throws MalformedURLException, InterruptedException{
    final String htmlurl="https://www.linkedin.com/uas/login-submit";
    // Bounded retry. The previous version called itself from the IOException handler with
    // no limit, so a persistent network failure never returned to the caller.
    final int maxAttempts=5;
    for(int attempt=0;attempt<maxAttempts;attempt++){
        HttpURLConnection httpConn = null;
        try
        {
            httpConn = (HttpURLConnection) new URL(htmlurl).openConnection();

            HttpURLConnection.setFollowRedirects(true);
            httpConn.setRequestMethod("POST");
            httpConn.setRequestProperty("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36");
            httpConn.setRequestProperty("Connection","keep-alive");
            httpConn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml");
            httpConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            httpConn.setRequestProperty("Cache-control","no-cache, no-store");
            httpConn.setRequestProperty("Host","www.linkedin.com");
            // POST with a request body; follow redirects on this connection.
            httpConn.setDoOutput(true);
            httpConn.setDoInput(true);
            httpConn.setUseCaches(false);
            httpConn.setInstanceFollowRedirects(true);
            // Timeouts must be configured before getOutputStream() opens the connection,
            // otherwise the connect timeout never applies.
            httpConn.setConnectTimeout(20000);
            httpConn.setReadTimeout(20000);

            // Form body; "email" / "gmail" are placeholders for the real credentials.
            StringBuffer str_buf = new StringBuffer(4096);
            str_buf.append("session_key").append("=").append("email").append("&");
            str_buf.append("session_password").append("=").append("gmail").append("&");
            OutputStream os = httpConn.getOutputStream();
            try{
                // Explicit charset so the bytes sent do not depend on the platform default.
                os.write(str_buf.toString().getBytes("UTF-8"));
                os.flush();
            }
            finally{
                os.close();
            }

            // Of all cookies the server sets, only li_at is needed to stay logged in.
            List<String> setCookies=httpConn.getHeaderFields().get("Set-Cookie");
            if(setCookies!=null){
                System.out.println("key=" + "Set-Cookie"+",开始获取cookie");
                for (String str : setCookies) {
                    if(str.split("=")[0].equals("li_at")){
                        return str;
                    }
                }
            }
            return "";
        }
        catch (final MalformedURLException me)
        {
            System.out.println("url不存在!");
            throw me;
        }
        catch (final FileNotFoundException me)
        {
            System.out.println(htmlurl+"反爬启动");
            return "0";
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            System.out.println("反爬启动:"+htmlurl+"次数:"+attempt);
        }
        finally
        {
            // Runs on every exit path, including the early return with the cookie.
            // Null check: openConnection() itself may have thrown.
            if(httpConn!=null){
                httpConn.disconnect();
            }
        }

        // Pause before the next attempt; no point sleeping after the last one.
        if(attempt<maxAttempts-1){
            Thread.sleep(20000);
        }
    }
    // Every attempt failed: same "0" sentinel as the FileNotFoundException branch.
    return "0";
}

以上是http处理部分，灵活应用post和get方法，可以获取HTML内容。

但是不同网站的反爬策略不同：有的封IP，需要登录的还有封帐号的；我遇到的是最简单的断开连接，直接让进程休眠即可。。。需要换IP、代理、cookie的情况，可以自己分析，基本也就是设置httpConn的一些值。

3.数据获取：

我一般采用正则匹配，这比较适用于爬取数据不多、网站只返回HTML内容且非常不规范的情况。。。比如linkedin，所有数据都在一个被注释掉的json里，夹杂着各种链接和奇怪的符号，用工具很难解析。。。

// Education info: every span of the page text s running from "fosList": up to the next
// schoolLogo is treated as one education entry and turned into one EduInfor.
String edu="null";
ArrayList<EduInfor> listEdu=new ArrayList<EduInfor>();
String regex1 = "\"fosList\":.*?schoolLogo";
Pattern pa1 = Pattern.compile(regex1, Pattern.DOTALL);
// The per-field patterns do not depend on the entry, so they are compiled once here
// instead of once per loop iteration. Each matches a key and everything up to the
// first comma that follows it.
final Pattern eduSchoolPa = Pattern.compile("\"schoolName\":.*?,", Pattern.DOTALL);
final Pattern eduDegreePa = Pattern.compile("\"fmt__degree_highlight\":.*?,", Pattern.DOTALL);
final Pattern eduMajorPa = Pattern.compile("\"fmt__fos_highlight\":.*?,", Pattern.DOTALL);
final Pattern eduGradePa = Pattern.compile("\"grade\":.*?,", Pattern.DOTALL);
final Pattern eduStartPa = Pattern.compile("\"startdate_my\":.*?,", Pattern.DOTALL);
final Pattern eduEndPa = Pattern.compile("\"enddate_my\":.*?,", Pattern.DOTALL);
Matcher ma1 = pa1.matcher(s);
while(ma1.find()){
    EduInfor ei=new EduInfor(ui.getCv_id());
    edu=ma1.group();
    Matcher ma;

    // School: strip the key, the quotes and the trailing comma from the match.
    // String.replace is used because every replaced text is a literal, not a regex.
    ma = eduSchoolPa.matcher(edu);
    if(ma.find()){
        String school=ma.group().replace("\"schoolName\":", "").replace("\"", "").replace(",", "");
        if(!school.equals("")){
            ei.setCollege(school);
        }
    }

    // Degree
    ma = eduDegreePa.matcher(edu);
    if(ma.find()){
        String degree=ma.group().replace("\"fmt__degree_highlight\":", "").replace("\"", "").replace(",", "");
        // Same effect as the former regex "\\u0027s", which matched an apostrophe followed
        // by 's' (e.g. "Bachelor's" -> "Bachelor").
        // NOTE(review): if the page carries the literal text \u0027s instead, this never
        // matches -- confirm against real input.
        degree=degree.replace("'s", "");
        if(!degree.equals("")){
            ei.setDegree_name(degree);
        }
    }

    // Major (field of study)
    ma = eduMajorPa.matcher(edu);
    if(ma.find()){
        String major=ma.group().replace("\"fmt__fos_highlight\":", "").replace("\"", "").replace(",", "");
        if(!major.equals("")){
            ei.setMajor(major);
        }
    }

    // Academic level, e.g. "grade":"1st"
    ma = eduGradePa.matcher(edu);
    if(ma.find()){
        String academic=ma.group().replace("\"grade\":", "").replace("\"", "").replace(",", "");
        if(!academic.equals("")){
            ei.setAcademic_name(academic);
        }
    }

    // Period, e.g. "enddate_my":"2005","startdate_my":"2002"; spaces are removed as well.
    ma = eduStartPa.matcher(edu);
    if(ma.find()){
        String s_time=ma.group().replace("\"startdate_my\":", "").replace("\"", "").replace(",", "").replace(" ", "");
        if(!s_time.equals("")){
            ei.setStart_time(s_time);
        }
    }

    ma = eduEndPa.matcher(edu);
    if(ma.find()){
        String e_time=ma.group().replace("\"enddate_my\":", "").replace("\"", "").replace(",", "").replace(" ", "");
        if(!e_time.equals("")){
            ei.setEnd_time(e_time);
        }
    }else{
        // No end date in the entry at all: record it with the fixed marker below.
        ei.setEnd_time("目前");
    }
    listEdu.add(ei);
}

很多人都说正则匹配复杂难用，记不住。。。其实我也记不住：（ 但是只需要用 .*? 的匹配方式，把有用的数据取出来，再replace掉不用的信息。。这样的代码重用率很高，写起来没那么痛苦。

4.数据输出

项目需要，全部转换成json格式，使用google的GSON包，一句话就把java的类转换成json格式的String，非常好用：）

// Build one UserInfor record, tag it with its origin and id, and write it out as a
// single line of JSON.
Gson gson=new Gson();
UserInfor ui=new UserInfor();
ui.setCv_origin("linkedin");
ui.setCv_id(it.getKey());
// presumably getInfor fills ui from the downloaded page text -- verify against cl
ui=cl.getInfor(tmp_content, ui);
// Encode explicitly as UTF-8: a bare getBytes() uses the platform default charset, so
// non-ASCII text in the record would be written differently from machine to machine.
out.write((gson.toJson(ui)+"\n").getBytes("UTF-8"));

5.多线程：

爬虫当然要多线程，很容易改就不写了。。。不过要写出优秀的多线程还是需要内功的，还在学。

总结：

爬虫的步骤很简单，请求http（包括处理各种异常），按需求获取数据，转换标准格式输出。。。最难的其实是第一步，包括了很多抓包分析的工作，这个很依靠经验的，多积累，不好说。。。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航