您的位置:首页 > 其它

抓取腾讯新闻评论

2015-10-12 15:56 323 查看
package com.orange.qqnews;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.json.JSONObject;

public class Test {
public static void main(String[] args) {
//新闻正文正则
String regex1 = "<div id=\"Cnt-Main-Article-QQ\" bossZone=\"content\">([\\d\\D]*?)</div>";
//评论ID正则
String regex2 = "cmt_id = ([\\d]*?);";
//获取网页源代码
String html = openUrl("http://news.qq.com/a/20150825/004734.htm","gb2312");
//获取新闻正则
String content = getContent(regex1,html);
System.out.println(content);
//获取评论ID
String cmtId = getContent(regex2,html);
System.out.println(cmtId);

//拼接评论地址
String cmtUrl = "http://coral.qq.com/article/"+cmtId+"/comment?commentid=0&reqnum=20";
String cmt = openUrl(cmtUrl,"gb2312");

JSONObject jsonMap = new JSONObject();
Map map = jsonMap.fromObject(cmt);
Map<String,List> data = (Map)map.get("data");
List<Map<String,String>> comments = data.get("commentid");

for(Map<String,String> m : comments){
String cmtContent = m.get("content"); //评论

//其他信息略过(回复人,回复时间,赞等)

System.out.println(cmtContent);
}

}

/**
* 访问url返回url的html代码
*/
public static String openUrl(String currentUrl,String charset) {
InputStream is = null;
BufferedReader br = null;
URL url;
StringBuffer html = new StringBuffer();
try {
url = new URL(currentUrl);
URLConnection conn = url.openConnection();
conn.setReadTimeout(5000);
conn.connect();
is = conn.getInputStream();
br = new BufferedReader(new InputStreamReader(is,charset));
String str;
while (null != (str = br.readLine())) {
html.append(str).append("\n");
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}

}
return html.toString();
}

private static String getContent(String regex,String text) {
String content = "";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(text);
while(matcher.find()) {
content = matcher.group(1).toString();
}
return content;
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: