您的位置:首页 > 理论基础 > 计算机网络

HttpClient使用例子:读取CSDN的投票列表并正则解析

2009-01-06 16:07 621 查看
这个属于前一个例子的实际应用版本,用来读取真实的页面并进行正则解析
package com.laozizhu.apache.httpclient;

import java.net.Socket;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.ConnectionReuseStrategy;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.DefaultHttpClientConnection;
import org.apache.http.message.BasicHttpRequest;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.BasicHttpProcessor;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.protocol.HttpRequestExecutor;
import org.apache.http.protocol.RequestConnControl;
import org.apache.http.protocol.RequestContent;
import org.apache.http.protocol.RequestExpectContinue;
import org.apache.http.protocol.RequestTargetHost;
import org.apache.http.protocol.RequestUserAgent;
import org.apache.http.util.EntityUtils;

/**
* HttpClient使用例子:读取CSDN的所有投票状态
*
* @author 老紫竹(java2000.net)
*/
public class HttpGet {
public static void main(String[] args) throws Exception {

HttpParams params = new BasicHttpParams();
// HTTP 协议的版本,1.1/1.0/0.9
HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
// 字符集
HttpProtocolParams.setContentCharset(params, "UTF-8");
// 伪装的浏览器类型
// IE7 是
// Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)
//
// Firefox3.03
// Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.3)
// Gecko/2008092417 Firefox/3.0.3
//
HttpProtocolParams.setUserAgent(params, "HttpComponents/1.1");
HttpProtocolParams.setUseExpectContinue(params, true);

BasicHttpProcessor httpproc = new BasicHttpProcessor();

httpproc.addInterceptor(new RequestContent());
httpproc.addInterceptor(new RequestTargetHost());

httpproc.addInterceptor(new RequestConnControl());
httpproc.addInterceptor(new RequestUserAgent());
httpproc.addInterceptor(new RequestExpectContinue());

HttpRequestExecutor httpexecutor = new HttpRequestExecutor();

HttpContext context = new BasicHttpContext(null);
HttpHost host = new HttpHost("vote.csdn.net", 80);

DefaultHttpClientConnection conn = new DefaultHttpClientConnection();
ConnectionReuseStrategy connStrategy = new DefaultConnectionReuseStrategy();

context.setAttribute(ExecutionContext.HTTP_CONNECTION, conn);
context.setAttribute(ExecutionContext.HTTP_TARGET_HOST, host);
System.out.println("<table>");
try {
// 这个85是因为目前有85页,如果有更多的页,需要手工修改或者传参数进来
for (int i = 1; i <= 85; i++) {
if (!conn.isOpen()) {
Socket socket = new Socket(host.getHostName(), host.getPort());
conn.bind(socket, params);
}
BasicHttpRequest request = new BasicHttpRequest("GET",
"http://vote.csdn.net/VoteList.aspx?page=" + i);

context.setAttribute(ExecutionContext.HTTP_REQUEST, request);
request.setParams(params);
httpexecutor.preProcess(request, httpproc, context);
HttpResponse response = httpexecutor.execute(request, conn, context);
response.setParams(params);
httpexecutor.postProcess(response, httpproc, context);

// 返回码
if (response.getStatusLine().getStatusCode() != 200) {
break;
}
parseData(EntityUtils.toString(response.getEntity()));
if (!connStrategy.keepAlive(response, context)) {
conn.close();
}
}
} finally {
conn.close();
}
System.out.println("</table>");
}

static final Pattern p = Pattern
.compile(
"<h4>.*?<a href=.*?voteid=(//d+)/">(.*?)</a></h4>.*?发起人:<a href=.*?>(.*?)</a>.*?<a href=.*?>(//d+) 人投票</a>",
Pattern.DOTALL);

/**
* 解析页面,得到投票编号,题目,发起人和参与人数
*
* @param msg
*/
public static void parseData(String msg) {
String[] parts = msg.split("div class=/"kimi_modifysty/">");
Matcher m;
for (String s : parts) {
m = p.matcher(s);
if (m.find()) {
System.out.println("<tr><td>" + m.group(1)
+ "</td><td><a href='http://vote.csdn.net/VotePost.aspx?voteid=" + m.group(1) + "'>"
+ m.group(2).replace(",", ",") + "</a></td><td>" + m.group(3) + "</td><td>"
+ m.group(4) + "</td></tr>");
}
}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: