您的位置:首页 > 编程语言 > Java开发

Java 正则表达式解析 HTML 示例分享

2016-01-05 00:56 591 查看
代码如下:

package work;
import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.regex.Matcher;

import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpException;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Small demo scraper: downloads one web page with Commons HttpClient and uses
 * regular expressions to print the name, link and picture found in each
 * {@code <li>} entry of the page's {@code compareForm} form.
 */
public class chuanboyi {

    /** Address of the page fetched by {@link #main(String[])}. */
    private static final String PAGE_URL = "http://www.jb51.net";

    /** Matches the {@code compareForm} form through to the script tag that follows it. */
    private static final Pattern FORM_PATTERN =
            Pattern.compile("<form name=\"compareForm\"[\\s\\S]+>[\\s\\S]+</form>.*<script.*>");

    /** Matches the content between {@code <li>} and {@code </li>} (reluctant, one item per match). */
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("(?<=<li>)[\\s\\S]+?(?=</li>)");

    /** Matches a relative link such as {@code ship-123.html}. */
    private static final Pattern URL_PATTERN =
            Pattern.compile("[a-z]+-[0-9]+\\.html");

    /**
     * Matches the value of a {@code title="..."} attribute made of word characters,
     * hyphens, whitespace and characters above U+00FF (e.g. CJK text).
     * The original source had {@code [^x00-xff]} without the {@code \x} escapes, which
     * excluded the literal range '0'..'x' instead and let the match include quotes
     * and other punctuation.
     */
    private static final Pattern NAME_PATTERN =
            Pattern.compile("(?<=title=\")[[\\w\\-\\s][^\\x00-\\xff]]+(?=\")");

    /** Matches a picture path that starts with {@code images} and ends with {@code .jpg}. */
    private static final Pattern PICTURE_PATTERN =
            Pattern.compile("images.*\\.jpg");

    /**
     * Fetches {@link #PAGE_URL}, extracts every list item inside the compare form,
     * prints its attributes via {@link #attrs(String)} and finally prints the item count.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpClient httpclient = new HttpClient();

        // Create the GET method instance.
        GetMethod getMethod = new GetMethod(PAGE_URL);

        // Use the library's default retry strategy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());

        try {
            // Execute the GET request.
            int statusCode = httpclient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("Method is wrong " + getMethod.getStatusLine());
                // Do not try to parse an error page; the finally block still releases the connection.
                return;
            }

            String html = readBody(getMethod.getResponseBodyAsStream());

            // Collect every matching form fragment, then look for list items inside them.
            StringBuilder forms = new StringBuilder();
            Matcher formMatcher = FORM_PATTERN.matcher(html);
            while (formMatcher.find()) {
                forms.append(formMatcher.group());
            }

            int count = 0;
            Matcher itemMatcher = ITEM_PATTERN.matcher(forms);
            while (itemMatcher.find()) {
                attrs(itemMatcher.group());
                count++;
            }

            System.out.println("共有"+count+"条数据!");
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("the line is wrong!");
            e.printStackTrace();
        } finally {
            // Release the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Reads the whole response body as UTF-8 text, normalising line ends to {@code \n}.
     *
     * @param body response stream; may be {@code null} when the response has no body
     * @return the body text, or an empty string when {@code body} is {@code null}
     * @throws IOException if reading from the stream fails
     */
    private static String readBody(InputStream body) throws IOException {
        if (body == null) {
            return "";
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, "utf-8"));
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\n");
            }
            return text.toString();
        } finally {
            // Close the reader even when readLine() throws.
            reader.close();
        }
    }

    /**
     * Prints the name, link and picture found in one list-item fragment.
     * Each line is printed only when the corresponding pattern matches; only the
     * first match of each kind is reported.
     *
     * @param str HTML fragment of a single list item; {@code null} is ignored
     */
    public static void attrs(String str) {
        if (str == null) {
            return;
        }

        Matcher mName = NAME_PATTERN.matcher(str);
        if (mName.find()) {
            System.out.println("名字:"+mName.group());
        }

        Matcher mURL = URL_PATTERN.matcher(str);
        if (mURL.find()) {
            System.out.println("链接:"+mURL.group());
        }

        Matcher mPicture = PICTURE_PATTERN.matcher(str);
        if (mPicture.find()) {
            System.out.println("图片:"+mPicture.group());
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: