您的位置:首页 > 其它

利用正则表达式抽取网页信息

2015-08-11 09:54 330 查看
import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.UnsupportedEncodingException;

import java.net.MalformedURLException;

import java.net.URL;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Downloads news article pages and pulls selected fields (title, body,
 * timestamp, reporter, channel, image and video links) out of the raw HTML
 * with regular expressions.
 *
 * <p>The patterns are hard-coded for the markup used by the test site noted
 * on {@link #main(String[])}. All patterns use lazy {@code .*?} without
 * {@code DOTALL}, so they only match within a single line; {@link #getHtml}
 * joins all lines of the page without separators, which is what lets the
 * patterns span what were originally several lines.
 */
public class ObtainNews {

    /** File that {@link #main(String[])} appends one tab-separated record per article to. */
    private static final String OUTPUT_FILE = "D://News.txt";

    /** Prefix prepended to a non-empty extracted video link in {@link #main(String[])}. */
    private static final String SITE_ROOT = "http://www.reuters.com";

    // The CANON_EQ flag is retained from the original patterns.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<h1>.*?</h1>", Pattern.CANON_EQ);

    private static final Pattern CONTENT_PATTERN =
            Pattern.compile("<span id=\"midArticle_start\"></span>.*?</span></span>", Pattern.CANON_EQ);

    private static final Pattern TIME_PATTERN =
            Pattern.compile("<span class=\"timestamp\">.*?</span> </p>", Pattern.CANON_EQ);

    private static final Pattern REPORTER_PATTERN =
            Pattern.compile("<p class=\"byline\">.*?</p> <p>", Pattern.CANON_EQ);

    private static final Pattern CHANNEL_PATTERN =
            Pattern.compile("<div class=\"actionButton\">.*?</a></div>", Pattern.CANON_EQ);

    // "\\s*" tolerates any (or no) whitespace between the closing quote of the
    // src attribute and the following "border" attribute.
    private static final Pattern IMG_PATTERN =
            Pattern.compile("<img src=\".*?\"\\s*border", Pattern.CANON_EQ);

    private static final Pattern VIDEO_PATTERN =
            Pattern.compile("<div class=\"photo\">.*?<img", Pattern.CANON_EQ);

    /**
     * Reads the entire content of a web page.
     *
     * <p>The page is decoded as UTF-8 and its lines are concatenated without
     * any line separator.
     *
     * @param htmlurl address of the page to read
     * @return the page content as a single line
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }

        final StringBuilder sb = new StringBuilder();
        final BufferedReader in = new BufferedReader(new InputStreamReader(
                url.openStream(), "utf-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } finally {
            // Close the stream even when reading fails part-way through.
            in.close();
        }
        return sb.toString();
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each one
     * preceded by {@code prefix}.
     *
     * @return the joined matches, or an empty string when nothing matches
     */
    private static String joinMatches(final Pattern pattern, final String s, final String prefix) {
        final StringBuilder joined = new StringBuilder();
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            joined.append(prefix).append(matcher.group());
        }
        return joined.toString();
    }

    /**
     * Extracts the title: all {@code <h1>...</h1>} elements, tags included,
     * concatenated.
     *
     * @param s page HTML
     * @return the matched markup, or an empty string when nothing matches
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE_PATTERN, s, "");
    }

    /**
     * Extracts the article body: every span of markup starting at
     * {@code <span id="midArticle_start"></span>} and ending at the next
     * {@code </span></span>}, concatenated.
     *
     * @param s page HTML
     * @return the matched markup, or an empty string when nothing matches
     */
    public String getContent(final String s) {
        return joinMatches(CONTENT_PATTERN, s, "");
    }

    /**
     * Extracts the publication time: every {@code <span class="timestamp">}
     * element that is followed by {@code " </p>"}, concatenated.
     *
     * @param s page HTML
     * @return the matched markup, or an empty string when nothing matches
     */
    public String getTime(final String s) {
        return joinMatches(TIME_PATTERN, s, "");
    }

    /**
     * Extracts the reporter byline: every {@code <p class="byline">} element
     * that is followed by {@code " <p>"}.
     *
     * @param s page HTML
     * @return the matches, each preceded by a single space, or an empty
     *         string when nothing matches
     */
    public String getReporter(final String s) {
        return joinMatches(REPORTER_PATTERN, s, " ");
    }

    /**
     * Extracts the channel: every span of markup from
     * {@code <div class="actionButton">} to the next {@code </a></div>}.
     *
     * @param s page HTML
     * @return the matches, each preceded by a single space, or an empty
     *         string when nothing matches
     */
    public String getChannel(final String s) {
        return joinMatches(CHANNEL_PATTERN, s, " ");
    }

    /**
     * Extracts image tags: every span of markup from {@code <img src="} up to
     * and including a closing quote that is followed (after optional
     * whitespace) by {@code border}, concatenated.
     *
     * @param s page HTML
     * @return the matched markup, or an empty string when nothing matches
     */
    public String getImgsrc(final String s) {
        return joinMatches(IMG_PATTERN, s, "");
    }

    /**
     * Extracts the markup used for the video link: every span from
     * {@code <div class="photo">} to the next {@code <img}, concatenated.
     *
     * @param s page HTML
     * @return the matched markup, or an empty string when nothing matches
     */
    public String getVideosrc(final String s) {
        return joinMatches(VIDEO_PATTERN, s, "");
    }

    /**
     * Removes tags.
     *
     * @param s markup to clean
     * @return {@code s} with every {@code <...>} sequence removed
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Downloads a page and extracts its fields.
     *
     * <p>Extraction is best effort: if the page cannot be read, the failure
     * is reported on standard error and every field is the empty string.
     *
     * @param s address of the page
     * @return a map with the keys {@code title}, {@code content},
     *         {@code time}, {@code reporter}, {@code channel}, {@code imgsrc}
     *         and {@code videosrc}; values are never {@code null}
     */
    public HashMap<String, String> getFromWeb(final String s) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";

        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final Exception e) {
            // Deliberately broad: a page that cannot be fetched must not abort
            // the whole run, but the reason should be visible.
            System.err.println("Failed to read " + s + ": " + e);
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");

        final String title = outTag(getTitle(html));
        final String content = outTag(getContent(html));
        final String time = outTag(getTime(html));
        final String reporter = outTag(getReporter(html)).replaceAll("By ", "");
        final String channel = outTag(getChannel(html));
        // Strip the markup surrounding the attribute value, then any spaces.
        final String imgsrc = getImgsrc(html)
                .replaceAll("<img src=\"", "")
                .replaceAll("\"\\s*border", "")
                .replaceAll(" ", "");
        final String videosrc = getVideosrc(html)
                .replaceAll("<div class=\"photo\"><a href=\"", "")
                .replaceAll("\"><img", "");

        hm.put("title", title);
        hm.put("content", content);
        hm.put("time", time);
        hm.put("reporter", reporter);
        hm.put("channel", channel);
        hm.put("imgsrc", imgsrc);
        hm.put("videosrc", videosrc);
        return hm;
    }

    /** Returns {@code null} for an empty string and {@code value} itself otherwise. */
    private static String emptyToNull(final String value) {
        return value.isEmpty() ? null : value;
    }

    /** Appends {@code record} to {@link #OUTPUT_FILE} as UTF-8; failures are printed, not thrown. */
    private static void appendRecord(final String record) {
        try {
            final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(OUTPUT_FILE, true), "utf-8"));
            try {
                bw.write(record);
            } finally {
                // close() also flushes the buffered data.
                bw.close();
            }
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads article URLs from standard input, one per line, until a line
     * containing {@code run} (or end of input), then extracts each article,
     * prints the fields and appends a tab-separated record to the output file.
     *
     * @param args unused; test site: www.reuters.com
     */
    public static void main(final String args[]) {
        final List<String> list = new ArrayList<String>();

        System.out.print("输入新闻页面网址,换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
                System.in));
        //http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        //http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String line;
            // readLine() returns null at end of input; treat that like "run".
            while ((line = br.readLine()) != null && !line.equals("run")) {
                list.add(line);
            }
        } catch (final IOException e) {
            // Continue with whatever URLs were read before the failure.
            System.err.println("Failed to read URLs from standard input: " + e);
        }

        final ObtainNews on = new ObtainNews();
        for (int i = 0; i < list.size(); i++) {
            final String pageUrl = list.get(i);
            final HashMap<String, String> hm = on.getFromWeb(pageUrl);

            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null;
            final String site = "reuters";
            final String reporter = emptyToNull(hm.get("reporter"));
            final String channel = emptyToNull(hm.get("channel"));
            final String subject = null;
            final String imgsrc = emptyToNull(hm.get("imgsrc"));

            String videosrc = emptyToNull(hm.get("videosrc"));
            if (videosrc != null) {
                // The extracted link is site-relative; make it absolute.
                videosrc = (SITE_ROOT + videosrc).replaceAll(" ", "");
            }

            final String str = pageUrl + "\t" + title + "\t" + content + "\t" + time + "\t" + publisher + "\t"
                    + site + "\t" + reporter + "\t" + channel + "\t" + subject + "\t" + imgsrc + "\t" + videosrc + "\n";

            System.out.println("URL: " + pageUrl);
            System.out.println("标题: " + title);
            System.out.println("正文: " + content);
            System.out.println("发布时间: " + time);
            System.out.println("发布者:" + publisher);
            System.out.println("来源站点:" + site);
            System.out.println("记者:" + reporter);
            System.out.println("分类频道:" + channel);
            System.out.println("主题:" + subject);
            System.out.println("图片链接:" + imgsrc);
            System.out.println("视频链接:" + videosrc);
            System.out.println(str);

            appendRecord(str);
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: