Java正则表达式解析HTML示例分享
2016-01-05 00:56
591 查看
代码如下:
package work;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Demo scraper: downloads one page with Commons HttpClient and extracts
 * per-item data (name, link, picture) from it using regular expressions.
 *
 * <p>The extraction happens in three passes: every {@code compareForm} form
 * region is collected, that text is split into the contents of its
 * {@code <li>} elements, and each item is handed to {@link #attrs(String)}.
 */
public class chuanboyi {

    /** Address of the page downloaded by {@link #main(String[])}. */
    private static final String PAGE_URL = "http://www.jb51.net";

    /** Matches a {@code compareForm} form up to a {@code <script ...>} tag on the line of {@code </form>}. */
    private static final Pattern FORM_PATTERN =
            Pattern.compile("<form name=\"compareForm\"[\\s\\S]+>[\\s\\S]+</form>.*<script.*>");

    /** Matches, reluctantly, the text between {@code <li>} and the next {@code </li>}. */
    private static final Pattern ITEM_PATTERN = Pattern.compile("(?<=<li>)[\\s\\S]+?(?=</li>)");

    /** Matches a link of the form {@code letters-digits.html}. */
    private static final Pattern URL_PATTERN = Pattern.compile("[a-z]+-[0-9]+\\.html");

    /**
     * Matches the value of a {@code title="..."} attribute.
     *
     * <p>NOTE(review): {@code x00-xff} carries no backslashes, so the inner
     * class negates the literal characters 'x' and 'f' plus the range
     * '0'..'x' instead of the hex range 00-ff. It looks like "any character
     * outside Latin-1" was intended; confirm before changing it, because the
     * corrected class would no longer accept punctuation inside titles.
     */
    private static final Pattern NAME_PATTERN =
            Pattern.compile("(?<=title=\")[[\\w-\\s][^x00-xff]]+(?=\")");

    /** Matches a picture path running from {@code images} to the last {@code .jpg} on the line. */
    private static final Pattern PICTURE_PATTERN = Pattern.compile("images.*\\.jpg");

    /**
     * Fetches {@link #PAGE_URL}, prints the attributes of every list item found
     * inside the compare form, then prints the number of items.
     *
     * <p>A response whose status is not 200 is reported and processing stops;
     * the body of an error response is not parsed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpClient httpclient = new HttpClient();
        // Create the GET request.
        GetMethod getMethod = new GetMethod(PAGE_URL);
        // Use the library's default retry strategy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
        try {
            // Execute the GET request.
            int statusCode = httpclient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("Method is wrong " + getMethod.getStatusLine());
                // Do not scrape an error page; the finally block still releases the connection.
                return;
            }
            String html = readBody(getMethod.getResponseBodyAsStream());

            int count = 0;
            Matcher items = ITEM_PATTERN.matcher(extractForms(html));
            while (items.find()) {
                attrs(items.group());
                count++;
            }
            System.out.println("共有" + count + "条数据!");
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("the line is wrong!");
            e.printStackTrace();
        } finally {
            // Release the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Reads a response body as UTF-8 text, terminating every line with {@code "\n"}.
     *
     * @param body the response stream; may be {@code null} when there is no body
     * @return the decoded text, or an empty string when {@code body} is {@code null}
     * @throws IOException if reading from the stream fails
     */
    private static String readBody(InputStream body) throws IOException {
        StringBuilder html = new StringBuilder();
        if (body == null) {
            return html.toString();
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, "utf-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
        } finally {
            // Close the reader even when a read fails part-way through.
            reader.close();
        }
        return html.toString();
    }

    /**
     * Concatenates every region of {@code html} matched by {@link #FORM_PATTERN}.
     *
     * @param html the full page text
     * @return the matched regions joined without a separator; empty when nothing matches
     */
    private static String extractForms(String html) {
        StringBuilder forms = new StringBuilder();
        Matcher m = FORM_PATTERN.matcher(html);
        while (m.find()) {
            forms.append(m.group());
        }
        return forms.toString();
    }

    /**
     * Prints the name, link and picture found in one list item, in that order.
     * Only the first match of each kind is printed; a kind with no match is skipped.
     *
     * @param str the text of a single list item; {@code null} prints nothing
     */
    public static void attrs(String str) {
        if (str == null) {
            return;
        }
        Matcher mName = NAME_PATTERN.matcher(str);
        if (mName.find()) {
            System.out.println("名字:" + mName.group());
        }
        Matcher mURL = URL_PATTERN.matcher(str);
        if (mURL.find()) {
            System.out.println("链接:" + mURL.group());
        }
        Matcher mPicture = PICTURE_PATTERN.matcher(str);
        if (mPicture.find()) {
            System.out.println("图片:" + mPicture.group());
        }
    }
}
package work;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Demo scraper: downloads one page with Commons HttpClient and extracts
 * per-item data (name, link, picture) from it using regular expressions.
 *
 * <p>The extraction happens in three passes: every {@code compareForm} form
 * region is collected, that text is split into the contents of its
 * {@code <li>} elements, and each item is handed to {@link #attrs(String)}.
 */
public class chuanboyi {

    /** Address of the page downloaded by {@link #main(String[])}. */
    private static final String PAGE_URL = "http://www.jb51.net";

    /** Matches a {@code compareForm} form up to a {@code <script ...>} tag on the line of {@code </form>}. */
    private static final Pattern FORM_PATTERN =
            Pattern.compile("<form name=\"compareForm\"[\\s\\S]+>[\\s\\S]+</form>.*<script.*>");

    /** Matches, reluctantly, the text between {@code <li>} and the next {@code </li>}. */
    private static final Pattern ITEM_PATTERN = Pattern.compile("(?<=<li>)[\\s\\S]+?(?=</li>)");

    /** Matches a link of the form {@code letters-digits.html}. */
    private static final Pattern URL_PATTERN = Pattern.compile("[a-z]+-[0-9]+\\.html");

    /**
     * Matches the value of a {@code title="..."} attribute.
     *
     * <p>NOTE(review): {@code x00-xff} carries no backslashes, so the inner
     * class negates the literal characters 'x' and 'f' plus the range
     * '0'..'x' instead of the hex range 00-ff. It looks like "any character
     * outside Latin-1" was intended; confirm before changing it, because the
     * corrected class would no longer accept punctuation inside titles.
     */
    private static final Pattern NAME_PATTERN =
            Pattern.compile("(?<=title=\")[[\\w-\\s][^x00-xff]]+(?=\")");

    /** Matches a picture path running from {@code images} to the last {@code .jpg} on the line. */
    private static final Pattern PICTURE_PATTERN = Pattern.compile("images.*\\.jpg");

    /**
     * Fetches {@link #PAGE_URL}, prints the attributes of every list item found
     * inside the compare form, then prints the number of items.
     *
     * <p>A response whose status is not 200 is reported and processing stops;
     * the body of an error response is not parsed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpClient httpclient = new HttpClient();
        // Create the GET request.
        GetMethod getMethod = new GetMethod(PAGE_URL);
        // Use the library's default retry strategy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
        try {
            // Execute the GET request.
            int statusCode = httpclient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("Method is wrong " + getMethod.getStatusLine());
                // Do not scrape an error page; the finally block still releases the connection.
                return;
            }
            String html = readBody(getMethod.getResponseBodyAsStream());

            int count = 0;
            Matcher items = ITEM_PATTERN.matcher(extractForms(html));
            while (items.find()) {
                attrs(items.group());
                count++;
            }
            System.out.println("共有" + count + "条数据!");
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("the line is wrong!");
            e.printStackTrace();
        } finally {
            // Release the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Reads a response body as UTF-8 text, terminating every line with {@code "\n"}.
     *
     * @param body the response stream; may be {@code null} when there is no body
     * @return the decoded text, or an empty string when {@code body} is {@code null}
     * @throws IOException if reading from the stream fails
     */
    private static String readBody(InputStream body) throws IOException {
        StringBuilder html = new StringBuilder();
        if (body == null) {
            return html.toString();
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, "utf-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
        } finally {
            // Close the reader even when a read fails part-way through.
            reader.close();
        }
        return html.toString();
    }

    /**
     * Concatenates every region of {@code html} matched by {@link #FORM_PATTERN}.
     *
     * @param html the full page text
     * @return the matched regions joined without a separator; empty when nothing matches
     */
    private static String extractForms(String html) {
        StringBuilder forms = new StringBuilder();
        Matcher m = FORM_PATTERN.matcher(html);
        while (m.find()) {
            forms.append(m.group());
        }
        return forms.toString();
    }

    /**
     * Prints the name, link and picture found in one list item, in that order.
     * Only the first match of each kind is printed; a kind with no match is skipped.
     *
     * @param str the text of a single list item; {@code null} prints nothing
     */
    public static void attrs(String str) {
        if (str == null) {
            return;
        }
        Matcher mName = NAME_PATTERN.matcher(str);
        if (mName.find()) {
            System.out.println("名字:" + mName.group());
        }
        Matcher mURL = URL_PATTERN.matcher(str);
        if (mURL.find()) {
            System.out.println("链接:" + mURL.group());
        }
        Matcher mPicture = PICTURE_PATTERN.matcher(str);
        if (mPicture.find()) {
            System.out.println("图片:" + mPicture.group());
        }
    }
}
相关文章推荐
- 10Java语法回顾之异常处理
- Spring架包变更org.springframework.dao.DataAccessResourceFailureException
- Java集合:线性表: JAVA_ArrayIntList
- IO流字符流-缓冲区
- Java Day6
- 使用spring拦截器做频率限制
- spring mail 邮件发送 附带 昵称
- java 深拷贝
- java使用Executor(执行器)管理线程
- eclipse 安装 git 插件
- 在蜂窝教育Java培训 毕业即就业
- 用plotly包创建交互式网页图形
- myeclipse连接SQL 2012
- 【Java资源大全】SonarQube:开源的代码质量管理工具
- Java:String和Date、Timestamp之间的转换
- eclipse validating 卡死
- maven 运行报错 rather than a JDK?
- "Initializing Java Tooling".
- 设计模式之结构模式
- spring mvc 单例模式