利用正则表达式抽取网页信息
2015-08-11 09:54
330 查看
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts news-article fields (title, body, timestamp, byline, channel, image links and
 * video link) from a web page by means of regular expressions.
 *
 * <p>The patterns are written against the article markup of www.reuters.com, the site named
 * in {@link #main(String[])}; pages with different markup simply produce empty fields.
 */
public class ObtainNews {

    /** Prefix added in {@link #main(String[])} to the video link extracted from a page. */
    private static final String SITE_ROOT = "http://www.reuters.com";

    /** File to which {@link #main(String[])} appends one tab-separated record per article. */
    private static final String OUTPUT_FILE = "D://News.txt";

    // Patterns are compiled once instead of on every call. CANON_EQ is kept from the
    // original code so that matching semantics stay the same.
    private static final Pattern TITLE = compile("<h1>.*?</h1>");
    private static final Pattern CONTENT =
            compile("<span id=\"midArticle_start\"></span>.*?</span></span>");
    private static final Pattern TIME = compile("<span class=\"timestamp\">.*?</span> </p>");
    private static final Pattern REPORTER = compile("<p class=\"byline\">.*?</p> <p>");
    private static final Pattern CHANNEL = compile("<div class=\"actionButton\">.*?</a></div>");
    // The original literal was broken across two lines (an unterminated string). The lost
    // whitespace between the closing quote and "border" is matched with \s* so that a
    // space, a line break or nothing at all is accepted.
    private static final Pattern IMG = compile("<img src=\".*?\"\\s*border");
    private static final Pattern VIDEO = compile("<div class=\"photo\">.*?<img");

    private static Pattern compile(final String regex) {
        return Pattern.compile(regex, Pattern.CANON_EQ);
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each preceded by
     * {@code prefix}.
     *
     * @param pattern pattern to search for
     * @param s       text to search
     * @param prefix  text inserted before every match (may be empty)
     * @return the concatenated matches, or an empty string when nothing matches
     */
    private static String joinMatches(final Pattern pattern, final String s,
            final String prefix) {
        final StringBuilder joined = new StringBuilder();
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            joined.append(prefix).append(matcher.group());
        }
        return joined.toString();
    }

    /** Maps an empty string to {@code null}; any other value is returned unchanged. */
    private static String emptyToNull(final String value) {
        return (value == null || value.isEmpty()) ? null : value;
    }

    /**
     * Downloads a page and returns its text with the line terminators removed.
     *
     * @param htmlurl address of the page to read
     * @return all lines of the page, decoded as UTF-8 and joined without separators
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        final StringBuilder sb = new StringBuilder();
        final BufferedReader in = new BufferedReader(new InputStreamReader(
                url.openStream(), "utf-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } finally {
            // Close the connection even when reading fails part-way through.
            in.close();
        }
        return sb.toString();
    }

    /**
     * Returns every {@code <h1>...</h1>} element of the page, tags included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE, s, "");
    }

    /**
     * Returns the article body: the markup starting at the {@code midArticle_start} span
     * and ending at the first following {@code </span></span>}.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getContent(final String s) {
        return joinMatches(CONTENT, s, "");
    }

    /**
     * Returns the {@code timestamp} span(s) of the page, tags included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getTime(final String s) {
        return joinMatches(TIME, s, "");
    }

    /**
     * Returns the {@code byline} paragraph(s) of the page, tags included.
     *
     * @param s page markup
     * @return the matches, each preceded by one space, or an empty string when there are none
     */
    public String getReporter(final String s) {
        return joinMatches(REPORTER, s, " ");
    }

    /**
     * Returns the {@code actionButton} div(s) of the page, tags included.
     *
     * @param s page markup
     * @return the matches, each preceded by one space, or an empty string when there are none
     */
    public String getChannel(final String s) {
        return joinMatches(CHANNEL, s, " ");
    }

    /**
     * Returns the opening part of every {@code <img src="..." border} tag of the page.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getImgsrc(final String s) {
        return joinMatches(IMG, s, "");
    }

    /**
     * Returns the markup between each {@code photo} div and the first following
     * {@code <img}, both ends included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getVideosrc(final String s) {
        return joinMatches(VIDEO, s, "");
    }

    /**
     * Removes everything that looks like a tag ({@code <...>}) from the text.
     *
     * @param s text that may contain markup
     * @return the text without tags
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Extracts the article fields from markup that has already been downloaded.
     *
     * @param html page markup
     * @return a map with the keys {@code title}, {@code content}, {@code time},
     *         {@code reporter}, {@code channel}, {@code imgsrc} and {@code videosrc};
     *         a field that is not found maps to an empty string
     */
    public HashMap<String, String> extractFields(final String html) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        hm.put("title", outTag(getTitle(html)));
        hm.put("content", outTag(getContent(html)));
        hm.put("time", outTag(getTime(html)));
        hm.put("reporter", outTag(getReporter(html)).replaceAll("By ", ""));
        hm.put("channel", outTag(getChannel(html)));
        // Strip the tag fragments surrounding each URL, then any remaining spaces.
        hm.put("imgsrc", getImgsrc(html)
                .replaceAll("<img src=\"", "")
                .replaceAll("\"\\s*border", "")
                .replaceAll(" ", ""));
        hm.put("videosrc", getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "")
                .replaceAll("\"><img", ""));
        return hm;
    }

    /**
     * Downloads a page and extracts its article fields.
     *
     * <p>A download failure is reported on standard error and treated as an empty page, so
     * every field of the result is then an empty string.
     *
     * @param s address of the page
     * @return the extracted fields, see {@link #extractFields(String)}
     */
    public HashMap<String, String> getFromWeb(final String s) {
        String html = "";
        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final Exception e) {
            // Best effort, as before, but the failure is no longer silently discarded.
            System.err.println("读取网页失败(" + s + "): " + e);
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");
        return extractFields(html);
    }

    /**
     * Reads article URLs from standard input, one per line, until the line {@code run} (or
     * end of input), then prints the extracted fields of each article and appends them as
     * one tab-separated line to {@code D://News.txt}.
     *
     * @param args unused; the code was tested against www.reuters.com
     */
    public static void main(final String[] args) {
        final List<String> urls = new ArrayList<String>();
        System.out.print("输入新闻页面网址,换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        //http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        //http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String line;
            // readLine() returns null at end of input; stop instead of dereferencing it.
            while ((line = br.readLine()) != null && !"run".equals(line)) {
                urls.add(line);
            }
        } catch (final IOException e) {
            // Continue with the URLs read so far.
            System.err.println(e);
        }
        final ObtainNews on = new ObtainNews();
        for (final String url : urls) {
            final HashMap<String, String> hm = on.getFromWeb(url);
            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null;
            final String site = "reuters";
            // Compare by value: "==" on strings only tests object identity.
            final String reporter = emptyToNull(hm.get("reporter"));
            final String channel = emptyToNull(hm.get("channel"));
            final String subject = null;
            final String imgsrc = emptyToNull(hm.get("imgsrc"));
            String videosrc = emptyToNull(hm.get("videosrc"));
            if (videosrc != null) {
                videosrc = (SITE_ROOT + videosrc).replaceAll(" ", "");
            }
            final String str = url + "\t" + title + "\t" + content + "\t" + time + "\t"
                    + publisher + "\t" + site + "\t" + reporter + "\t" + channel + "\t"
                    + subject + "\t" + imgsrc + "\t" + videosrc + "\n";
            System.out.println("URL: " + url);
            System.out.println("标题: " + title);
            System.out.println("正文: " + content);
            System.out.println("发布时间: " + time);
            System.out.println("发布者:" + publisher);
            System.out.println("来源站点:" + site);
            System.out.println("记者:" + reporter);
            System.out.println("分类频道:" + channel);
            System.out.println("主题:" + subject);
            System.out.println("图片链接:" + imgsrc);
            System.out.println("视频链接:" + videosrc);
            System.out.println(str);
            try {
                final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(OUTPUT_FILE, true), "utf-8"));
                try {
                    bw.write(str);
                } finally {
                    // close() flushes; running it in finally releases the file on failure.
                    bw.close();
                }
            } catch (final IOException e) {
                e.printStackTrace();
            }
        }
    }
}
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts news-article fields (title, body, timestamp, byline, channel, image links and
 * video link) from a web page by means of regular expressions.
 *
 * <p>The patterns are written against the article markup of www.reuters.com, the site named
 * in {@link #main(String[])}; pages with different markup simply produce empty fields.
 */
public class ObtainNews {

    /** Prefix added in {@link #main(String[])} to the video link extracted from a page. */
    private static final String SITE_ROOT = "http://www.reuters.com";

    /** File to which {@link #main(String[])} appends one tab-separated record per article. */
    private static final String OUTPUT_FILE = "D://News.txt";

    // Patterns are compiled once instead of on every call. CANON_EQ is kept from the
    // original code so that matching semantics stay the same.
    private static final Pattern TITLE = compile("<h1>.*?</h1>");
    private static final Pattern CONTENT =
            compile("<span id=\"midArticle_start\"></span>.*?</span></span>");
    private static final Pattern TIME = compile("<span class=\"timestamp\">.*?</span> </p>");
    private static final Pattern REPORTER = compile("<p class=\"byline\">.*?</p> <p>");
    private static final Pattern CHANNEL = compile("<div class=\"actionButton\">.*?</a></div>");
    // The original literal was broken across two lines (an unterminated string). The lost
    // whitespace between the closing quote and "border" is matched with \s* so that a
    // space, a line break or nothing at all is accepted.
    private static final Pattern IMG = compile("<img src=\".*?\"\\s*border");
    private static final Pattern VIDEO = compile("<div class=\"photo\">.*?<img");

    private static Pattern compile(final String regex) {
        return Pattern.compile(regex, Pattern.CANON_EQ);
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each preceded by
     * {@code prefix}.
     *
     * @param pattern pattern to search for
     * @param s       text to search
     * @param prefix  text inserted before every match (may be empty)
     * @return the concatenated matches, or an empty string when nothing matches
     */
    private static String joinMatches(final Pattern pattern, final String s,
            final String prefix) {
        final StringBuilder joined = new StringBuilder();
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            joined.append(prefix).append(matcher.group());
        }
        return joined.toString();
    }

    /** Maps an empty string to {@code null}; any other value is returned unchanged. */
    private static String emptyToNull(final String value) {
        return (value == null || value.isEmpty()) ? null : value;
    }

    /**
     * Downloads a page and returns its text with the line terminators removed.
     *
     * @param htmlurl address of the page to read
     * @return all lines of the page, decoded as UTF-8 and joined without separators
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        final StringBuilder sb = new StringBuilder();
        final BufferedReader in = new BufferedReader(new InputStreamReader(
                url.openStream(), "utf-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } finally {
            // Close the connection even when reading fails part-way through.
            in.close();
        }
        return sb.toString();
    }

    /**
     * Returns every {@code <h1>...</h1>} element of the page, tags included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE, s, "");
    }

    /**
     * Returns the article body: the markup starting at the {@code midArticle_start} span
     * and ending at the first following {@code </span></span>}.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getContent(final String s) {
        return joinMatches(CONTENT, s, "");
    }

    /**
     * Returns the {@code timestamp} span(s) of the page, tags included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getTime(final String s) {
        return joinMatches(TIME, s, "");
    }

    /**
     * Returns the {@code byline} paragraph(s) of the page, tags included.
     *
     * @param s page markup
     * @return the matches, each preceded by one space, or an empty string when there are none
     */
    public String getReporter(final String s) {
        return joinMatches(REPORTER, s, " ");
    }

    /**
     * Returns the {@code actionButton} div(s) of the page, tags included.
     *
     * @param s page markup
     * @return the matches, each preceded by one space, or an empty string when there are none
     */
    public String getChannel(final String s) {
        return joinMatches(CHANNEL, s, " ");
    }

    /**
     * Returns the opening part of every {@code <img src="..." border} tag of the page.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getImgsrc(final String s) {
        return joinMatches(IMG, s, "");
    }

    /**
     * Returns the markup between each {@code photo} div and the first following
     * {@code <img}, both ends included.
     *
     * @param s page markup
     * @return the concatenated matches, or an empty string when there are none
     */
    public String getVideosrc(final String s) {
        return joinMatches(VIDEO, s, "");
    }

    /**
     * Removes everything that looks like a tag ({@code <...>}) from the text.
     *
     * @param s text that may contain markup
     * @return the text without tags
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Extracts the article fields from markup that has already been downloaded.
     *
     * @param html page markup
     * @return a map with the keys {@code title}, {@code content}, {@code time},
     *         {@code reporter}, {@code channel}, {@code imgsrc} and {@code videosrc};
     *         a field that is not found maps to an empty string
     */
    public HashMap<String, String> extractFields(final String html) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        hm.put("title", outTag(getTitle(html)));
        hm.put("content", outTag(getContent(html)));
        hm.put("time", outTag(getTime(html)));
        hm.put("reporter", outTag(getReporter(html)).replaceAll("By ", ""));
        hm.put("channel", outTag(getChannel(html)));
        // Strip the tag fragments surrounding each URL, then any remaining spaces.
        hm.put("imgsrc", getImgsrc(html)
                .replaceAll("<img src=\"", "")
                .replaceAll("\"\\s*border", "")
                .replaceAll(" ", ""));
        hm.put("videosrc", getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "")
                .replaceAll("\"><img", ""));
        return hm;
    }

    /**
     * Downloads a page and extracts its article fields.
     *
     * <p>A download failure is reported on standard error and treated as an empty page, so
     * every field of the result is then an empty string.
     *
     * @param s address of the page
     * @return the extracted fields, see {@link #extractFields(String)}
     */
    public HashMap<String, String> getFromWeb(final String s) {
        String html = "";
        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final Exception e) {
            // Best effort, as before, but the failure is no longer silently discarded.
            System.err.println("读取网页失败(" + s + "): " + e);
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");
        return extractFields(html);
    }

    /**
     * Reads article URLs from standard input, one per line, until the line {@code run} (or
     * end of input), then prints the extracted fields of each article and appends them as
     * one tab-separated line to {@code D://News.txt}.
     *
     * @param args unused; the code was tested against www.reuters.com
     */
    public static void main(final String[] args) {
        final List<String> urls = new ArrayList<String>();
        System.out.print("输入新闻页面网址,换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        //http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        //http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String line;
            // readLine() returns null at end of input; stop instead of dereferencing it.
            while ((line = br.readLine()) != null && !"run".equals(line)) {
                urls.add(line);
            }
        } catch (final IOException e) {
            // Continue with the URLs read so far.
            System.err.println(e);
        }
        final ObtainNews on = new ObtainNews();
        for (final String url : urls) {
            final HashMap<String, String> hm = on.getFromWeb(url);
            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null;
            final String site = "reuters";
            // Compare by value: "==" on strings only tests object identity.
            final String reporter = emptyToNull(hm.get("reporter"));
            final String channel = emptyToNull(hm.get("channel"));
            final String subject = null;
            final String imgsrc = emptyToNull(hm.get("imgsrc"));
            String videosrc = emptyToNull(hm.get("videosrc"));
            if (videosrc != null) {
                videosrc = (SITE_ROOT + videosrc).replaceAll(" ", "");
            }
            final String str = url + "\t" + title + "\t" + content + "\t" + time + "\t"
                    + publisher + "\t" + site + "\t" + reporter + "\t" + channel + "\t"
                    + subject + "\t" + imgsrc + "\t" + videosrc + "\n";
            System.out.println("URL: " + url);
            System.out.println("标题: " + title);
            System.out.println("正文: " + content);
            System.out.println("发布时间: " + time);
            System.out.println("发布者:" + publisher);
            System.out.println("来源站点:" + site);
            System.out.println("记者:" + reporter);
            System.out.println("分类频道:" + channel);
            System.out.println("主题:" + subject);
            System.out.println("图片链接:" + imgsrc);
            System.out.println("视频链接:" + videosrc);
            System.out.println(str);
            try {
                final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(OUTPUT_FILE, true), "utf-8"));
                try {
                    bw.write(str);
                } finally {
                    // close() flushes; running it in finally releases the file on failure.
                    bw.close();
                }
            } catch (final IOException e) {
                e.printStackTrace();
            }
        }
    }
}
相关文章推荐
- 计算自己活了多少天 SimpleDateFormat Date getTime()
- Magento中如何判断一个用户是否已经登陆
- 手机访问pc网站html,body默认渲染成980宽度
- POJ 1466:Girls and Boys 二分图的最大点独立集
- iOS库 .a与.framework区别
- FLURRY 文档摘要及备注
- 解决Qt中QLabel控件加载动态图出现黑色背景的问题
- [中文版/英文版]微软推出Win10功能演示网站 帮助新手快速上手
- URAL 1037 Memory Management
- linux 安装jpeg 出错。
- Linux下文件权限更改问题
- 读书笔记-《 我的成功可以复制》三
- HTML5
- HDU 5353—— Average——————【贪心+枚举】
- socket client简单传输数据
- php -> =>的问题
- TCP/IP协议学习之一(以太网帧格式)
- 【SpringMVC】annotation配置(事务不失效)
- configure配置及其cmake配置-静态库生成
- Spring小记(一)