// 您的位置:首页 > Web前端 > HTML
//
// HtmlParser【ed2k】
//
// 2015-12-28 21:39 701 查看
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import sun.net.www.URLConnection;

/**
 * Extracts ed2k links from a page reachable through an {@code http}/{@code https}
 * or {@code file} URL.
 *
 * <p>The page is read line by line and at most one link is collected per line.
 * The page encoding is taken from the {@code charset} parameter of the reported
 * content type and falls back to {@code GBK} when none is reported.
 *
 * <p>Implementation note: this file imports {@code sun.net.www.URLConnection}, so the
 * public connection type is always referred to by its fully-qualified name
 * {@code java.net.URLConnection} inside this class.
 */
public class HtmlParser {
    /**
     * Matches an ed2k link inside a line of text. The quantifier is greedy, so the
     * match extends to the last {@code |/} on the line.
     */
    private static final Pattern ED2K_LINK = Pattern.compile("ed2k://.*[|]/");

    /**
     * Captures the value of a {@code charset=} parameter, ignoring case, optional
     * surrounding quotes and any following {@code ;}-separated parameters.
     */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Encoding used when the content type does not name one. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** URL of the page to analyse. */
    String htmlUrl;

    /** Links found by the most recent parse. */
    ArrayList<String> hrefList = new ArrayList<>();

    /** Encoding used to decode the page during the most recent parse. */
    String charSet;

    /**
     * Creates a parser for the given page.
     *
     * @param htmlUrl URL of the page to analyse; only URLs starting with
     *                {@code http} or {@code file} are actually read
     */
    public HtmlParser(String htmlUrl) {
        this.htmlUrl = htmlUrl;
    }

    /**
     * Reads the page and returns the ed2k links found in it.
     *
     * <p>Every call re-reads the page; the result list is reset first, so repeated
     * calls do not accumulate duplicate entries.
     *
     * @return the links found, in page order (empty if none, or if the URL scheme
     *         is not handled)
     * @throws IOException if the URL is malformed, the page cannot be read, or the
     *                     reported charset is not supported
     */
    public ArrayList<String> getHrefList() throws IOException {
        parser();
        return hrefList;
    }

    /**
     * Fetches the page and fills {@link #hrefList} with the links it contains.
     *
     * @throws IOException if the URL is malformed, the page cannot be read, or the
     *                     reported charset is not supported
     */
    private void parser() throws IOException {
        // Start from a clean slate so a second getHrefList() call does not duplicate results.
        hrefList.clear();

        URL url = new URL(htmlUrl);
        if (!htmlUrl.startsWith("http") && !htmlUrl.startsWith("file")) {
            // Other schemes were never handled; keep returning an empty result for them.
            return;
        }

        // The public URLConnection API covers both http(s) and file URLs, so no cast to
        // a JDK-internal connection class is needed. Output is not enabled because the
        // page is only read.
        java.net.URLConnection connection = url.openConnection();
        try {
            charSet = getCharset(connection.getContentType());
            if (charSet == null) {
                charSet = DEFAULT_CHARSET;
            }

            // The raw stream is declared as its own resource so that it is closed even
            // when the reader cannot be created (e.g. unsupported charset name).
            try (java.io.InputStream in = connection.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, charSet))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String link = getHref(line);
                    if (link != null) {
                        hrefList.add(link);
                    }
                }
            }
        } finally {
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
    }

    /**
     * Extracts the charset name from a content-type string.
     *
     * @param str content type such as {@code text/html; charset=utf-8}; may be
     *            {@code null} when no content type is reported
     * @return the charset name, or {@code null} if {@code str} is {@code null} or
     *         carries no non-empty {@code charset} parameter
     */
    private String getCharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PARAM.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Finds the ed2k link in a single line of text.
     *
     * @param str one line of the page
     * @return the matched link text, or {@code null} if the line contains none
     */
    private String getHref(String str) {
        Matcher matcher = ED2K_LINK.matcher(str);
        return matcher.find() ? matcher.group(0) : null;
    }

    /**
     * Prints every ed2k link found on a page, one per line.
     *
     * @param arg optional; {@code arg[0]} is the URL to scan, otherwise a built-in
     *            default URL is used
     * @throws IOException if the page cannot be read
     */
    public static void main(String[] arg) throws IOException {
        String target = arg.length != 0 ? arg[0] : "http://news.163.com/";
        HtmlParser parser = new HtmlParser(target);
        for (String link : parser.getHrefList()) {
            System.out.println(link);
        }
    }

}
// 内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
// 标签: