抓取腾讯新闻评论
2015-10-12 15:56
323 查看
package com.orange.qqnews;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.json.JSONObject;

/**
 * Small demo that downloads one news.qq.com article page, prints the article
 * body and the comment id found in the page, then downloads the first batch
 * of comments for that id from coral.qq.com and prints each comment's text.
 */
public class Test {

    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 5000;

    public static void main(String[] args) {
        // Regex whose group 1 captures the article body.
        String regex1 = "<div id=\"Cnt-Main-Article-QQ\" bossZone=\"content\">([\\d\\D]*?)</div>";
        // Regex whose group 1 captures the numeric comment id.
        String regex2 = "cmt_id = ([\\d]*?);";

        // Download the page source.
        String html = openUrl("http://news.qq.com/a/20150825/004734.htm", "gb2312");

        // Extract and print the article body.
        String content = getContent(regex1, html);
        System.out.println(content);

        // Extract and print the comment id.
        String cmtId = getContent(regex2, html);
        System.out.println(cmtId);

        // Without an id the comment URL below would be malformed, so stop here.
        if (cmtId.isEmpty()) {
            System.err.println("No cmt_id found in page; skipping comment download.");
            return;
        }

        // Build the comment URL.
        String cmtUrl = "http://coral.qq.com/article/" + cmtId + "/comment?commentid=0&reqnum=20";
        // NOTE(review): the response is decoded as gb2312, same as the article page;
        // confirm the comment endpoint's actual encoding.
        String cmt = openUrl(cmtUrl, "gb2312");

        // openUrl returns whatever it managed to read (possibly nothing) on failure.
        if (cmt.trim().isEmpty()) {
            System.err.println("Empty comment response from " + cmtUrl);
            return;
        }

        // fromObject is static; the parsed object is navigated through its Map view.
        Map<?, ?> root = JSONObject.fromObject(cmt);

        // Type-check each level instead of casting blindly: a missing or null
        // "data"/"commentid" entry must not end in a ClassCastException or NPE.
        Object data = root.get("data");
        if (!(data instanceof Map)) {
            System.err.println("Comment response has no \"data\" object.");
            return;
        }
        Object comments = ((Map<?, ?>) data).get("commentid");
        if (!(comments instanceof List)) {
            System.err.println("Comment response has no \"commentid\" list.");
            return;
        }

        for (Object item : (List<?>) comments) {
            if (!(item instanceof Map)) {
                continue;
            }
            // Comment text; other fields (author, time, likes, ...) are ignored.
            Object cmtContent = ((Map<?, ?>) item).get("content");
            System.out.println(cmtContent);
        }
    }

    /**
     * Opens the given URL and returns the response body as text.
     *
     * <p>This is best effort: any exception is printed to standard error and
     * whatever was read up to that point (possibly the empty string) is
     * returned. Each line read is followed by a single {@code "\n"}.
     *
     * @param currentUrl the URL to fetch
     * @param charset    name of the charset used to decode the response bytes
     * @return the text read from the URL; never {@code null}
     */
    public static String openUrl(String currentUrl, String charset) {
        StringBuilder html = new StringBuilder();
        InputStream is = null;
        BufferedReader br = null;
        try {
            URLConnection conn = new URL(currentUrl).openConnection();
            // Bound the connect phase as well as reads so a dead host cannot hang the call.
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.connect();
            is = conn.getInputStream();
            br = new BufferedReader(new InputStreamReader(is, charset));
            String line;
            while ((line = br.readLine()) != null) {
                html.append(line).append("\n");
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting a (possibly partial) string back.
            e.printStackTrace();
        } finally {
            closeQuietly(br);
            // Still needed when the reader could not be created (e.g. unsupported charset).
            closeQuietly(is);
        }
        return html.toString();
    }

    /** Closes the resource if it is non-null, printing any failure instead of throwing. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns capture group 1 of the last match of {@code regex} in {@code text}.
     *
     * @param regex a pattern containing at least one capturing group
     * @param text  the text to search
     * @return group 1 of the last match, or the empty string when nothing matches
     */
    private static String getContent(String regex, String text) {
        String content = "";
        Matcher matcher = Pattern.compile(regex).matcher(text);
        while (matcher.find()) {
            String captured = matcher.group(1);
            // group(1) is null when the group took no part in the match; keep the previous value.
            if (captured != null) {
                content = captured;
            }
        }
        return content;
    }
}
相关文章推荐
- Oracle查询表里的重复数据方法:
- 关于LR中的EXTRARES
- mybatis
- OTG_FS_ID功能及引申
- Java基础知识强化之IO流笔记58:内存操作流
- maven中snapshot快照库和release发布库的区别和作用
- unity3d创建一个跟随人物的血条
- 关于形如--error LNK2005: xxx 已经在 msvcrtd.lib ( MSVCR90D.dll ) 中定义--的问题分析解决
- ibatis resultMap使用的问题
- iOS解决键盘遮挡输入框的问题
- 利用开源工具搭一套汉英翻译系统(三):语言模型工具SRILM
- centos 安装 git
- KinSlideshow出现空格问题
- android Activity实现从底部弹出或滑出选择菜单或窗口
- BlueDroid 蓝牙启动流程分析
- 日志工具类
- Hdu 5496 Beauty of Sequence (组合数)
- Android Animation学习笔记
- 移动APP如何摆脱“叫好不叫座”的局面
- 韩顺平 java 第36讲 事件监听