用 Java 解析 HTML 标签
2008-11-11 09:29
323 查看
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.hothouseobjects.tags.Inspector;
import com.hothouseobjects.tags.Tag;
import com.hothouseobjects.tags.TagTiller;
import com.hothouseobjects.tags.Text;
/**
 * Small demo of the {@code com.hothouseobjects.tags} library: parses a local
 * HTML file, prints selected attributes of its {@code <a>}, {@code <img>} and
 * {@code <td>} tags, and writes a modified copy of the page to disk.
 */
public class ParseHtml {

    /**
     * Parses {@code d://ttt.htm}, prints link/image/table-cell information to
     * standard output and writes the modified page to {@code test000.html}
     * in the current working directory.
     *
     * <p>Any exception is caught and its stack trace printed; nothing is
     * rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            File file = new File("d://ttt.htm");
            // NOTE(review): decodes with the platform default charset, as the
            // original did; the real encoding of the input file is unknown here.
            Reader read = new StringReader(new String(readFully(file)));
            TagTiller tagtiller = new TagTiller(read);
            tagtiller.runTiller();
            Tag thePage = tagtiller.getTilledTags();

            printHrefs(thePage);
            printImages(thePage);
            writeModifiedPage(thePage, new File("test000.html"));
            printTableCells(thePage);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Reads the whole file into a byte array. A single {@code read} call may
     * return fewer bytes than requested, so this loops until the buffer is
     * full or end of stream is reached, and always closes the stream.
     */
    private static byte[] readFully(File file) throws java.io.IOException {
        long size = file.length();
        if (size > Integer.MAX_VALUE) {
            throw new java.io.IOException("File too large to load into memory: " + file);
        }
        int len = (int) size;
        byte[] buffer = new byte[len];
        FileInputStream fis = new FileInputStream(file);
        try {
            int offset = 0;
            while (offset < len) {
                int n = fis.read(buffer, offset, len - offset);
                if (n < 0) {
                    break; // file ended earlier than its reported length
                }
                offset += n;
            }
            if (offset < len) {
                // Do not hand trailing zero bytes to the parser.
                byte[] trimmed = new byte[offset];
                System.arraycopy(buffer, 0, trimmed, 0, offset);
                return trimmed;
            }
            return buffer;
        } finally {
            fis.close();
        }
    }

    /** Prints the href attribute of every "a" tag, last collected tag first. */
    private static void printHrefs(Tag thePage) {
        // search <a href=......
        List theHref = Inspector.collectByType(thePage, "a");
        for (int i = theHref.size() - 1; i >= 0; i--) {
            System.out.println(((Tag) theHref.get(i)).getAttributeValue("href"));
        }
    }

    /** Prints the HTML, src and alt of every "img" tag, in collected order. */
    private static void printImages(Tag thePage) {
        // search <img src=.....
        List theImg = Inspector.collectByType(thePage, "img");
        for (int j = 0; j < theImg.size(); j++) {
            Tag img = (Tag) theImg.get(j);
            System.out.println(img.toHTML());
            System.out.println(img.getAttributeValue("src"));
            System.out.println(img.getAttributeValue("alt"));
        }
    }

    /**
     * Builds an H1 &gt; FONT &gt; U &gt; text fragment, appends it to the
     * page's "title" tag when one is found, and writes the page to
     * {@code target}.
     */
    private static void writeModifiedPage(Tag thePage, File target) throws Exception {
        Tag big = new Tag("H1");
        Tag red = new Tag("FONT");
        Tag underlined = new Tag("U");
        // The original literal "#CCOOOO" used the letter O instead of zero,
        // which is not a valid hex colour.
        red.setAttribute("color", "#CC0000");
        big.addItem(red);
        red.addItem(underlined);
        underlined.addItem(new Text("Sorry, no addresses found."));

        // NOTE(review): the fragment is appended to <title> as in the original;
        // <body> may have been the intended target - confirm.
        Tag title = Inspector.locateByType(thePage, "title");
        if (title != null) {
            title.addItem(big);
        } else {
            // Presumably locateByType yields null when nothing matches; skip the
            // insertion rather than abort the rest of the run.
            System.err.println("No <title> tag found; page written unmodified.");
        }

        StringWriter writer = new StringWriter();
        thePage.toHTML(writer);
        FileOutputStream fos = new FileOutputStream(target);
        try {
            fos.write(writer.toString().getBytes());
        } finally {
            fos.close();
        }
    }

    /** Prints the HTML of every "td" tag, in collected order. */
    private static void printTableCells(Tag thePage) {
        // search of <table
        List theTd = Inspector.collectByType(thePage, "td");
        for (int k = 0; k < theTd.size(); k++) {
            System.out.println(((Tag) theTd.get(k)).toHTML());
        }
    }
}
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.hothouseobjects.tags.Inspector;
import com.hothouseobjects.tags.Tag;
import com.hothouseobjects.tags.TagTiller;
import com.hothouseobjects.tags.Text;
/**
 * Small demo of the {@code com.hothouseobjects.tags} library: parses a local
 * HTML file, prints selected attributes of its {@code <a>}, {@code <img>} and
 * {@code <td>} tags, and writes a modified copy of the page to disk.
 */
public class ParseHtml {

    /**
     * Parses {@code d://ttt.htm}, prints link/image/table-cell information to
     * standard output and writes the modified page to {@code test000.html}
     * in the current working directory.
     *
     * <p>Any exception is caught and its stack trace printed; nothing is
     * rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            File file = new File("d://ttt.htm");
            // NOTE(review): decodes with the platform default charset, as the
            // original did; the real encoding of the input file is unknown here.
            Reader read = new StringReader(new String(readFully(file)));
            TagTiller tagtiller = new TagTiller(read);
            tagtiller.runTiller();
            Tag thePage = tagtiller.getTilledTags();

            printHrefs(thePage);
            printImages(thePage);
            writeModifiedPage(thePage, new File("test000.html"));
            printTableCells(thePage);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Reads the whole file into a byte array. A single {@code read} call may
     * return fewer bytes than requested, so this loops until the buffer is
     * full or end of stream is reached, and always closes the stream.
     */
    private static byte[] readFully(File file) throws java.io.IOException {
        long size = file.length();
        if (size > Integer.MAX_VALUE) {
            throw new java.io.IOException("File too large to load into memory: " + file);
        }
        int len = (int) size;
        byte[] buffer = new byte[len];
        FileInputStream fis = new FileInputStream(file);
        try {
            int offset = 0;
            while (offset < len) {
                int n = fis.read(buffer, offset, len - offset);
                if (n < 0) {
                    break; // file ended earlier than its reported length
                }
                offset += n;
            }
            if (offset < len) {
                // Do not hand trailing zero bytes to the parser.
                byte[] trimmed = new byte[offset];
                System.arraycopy(buffer, 0, trimmed, 0, offset);
                return trimmed;
            }
            return buffer;
        } finally {
            fis.close();
        }
    }

    /** Prints the href attribute of every "a" tag, last collected tag first. */
    private static void printHrefs(Tag thePage) {
        // search <a href=......
        List theHref = Inspector.collectByType(thePage, "a");
        for (int i = theHref.size() - 1; i >= 0; i--) {
            System.out.println(((Tag) theHref.get(i)).getAttributeValue("href"));
        }
    }

    /** Prints the HTML, src and alt of every "img" tag, in collected order. */
    private static void printImages(Tag thePage) {
        // search <img src=.....
        List theImg = Inspector.collectByType(thePage, "img");
        for (int j = 0; j < theImg.size(); j++) {
            Tag img = (Tag) theImg.get(j);
            System.out.println(img.toHTML());
            System.out.println(img.getAttributeValue("src"));
            System.out.println(img.getAttributeValue("alt"));
        }
    }

    /**
     * Builds an H1 &gt; FONT &gt; U &gt; text fragment, appends it to the
     * page's "title" tag when one is found, and writes the page to
     * {@code target}.
     */
    private static void writeModifiedPage(Tag thePage, File target) throws Exception {
        Tag big = new Tag("H1");
        Tag red = new Tag("FONT");
        Tag underlined = new Tag("U");
        // The original literal "#CCOOOO" used the letter O instead of zero,
        // which is not a valid hex colour.
        red.setAttribute("color", "#CC0000");
        big.addItem(red);
        red.addItem(underlined);
        underlined.addItem(new Text("Sorry, no addresses found."));

        // NOTE(review): the fragment is appended to <title> as in the original;
        // <body> may have been the intended target - confirm.
        Tag title = Inspector.locateByType(thePage, "title");
        if (title != null) {
            title.addItem(big);
        } else {
            // Presumably locateByType yields null when nothing matches; skip the
            // insertion rather than abort the rest of the run.
            System.err.println("No <title> tag found; page written unmodified.");
        }

        StringWriter writer = new StringWriter();
        thePage.toHTML(writer);
        FileOutputStream fos = new FileOutputStream(target);
        try {
            fos.write(writer.toString().getBytes());
        } finally {
            fos.close();
        }
    }

    /** Prints the HTML of every "td" tag, in collected order. */
    private static void printTableCells(Tag thePage) {
        // search of <table
        List theTd = Inspector.collectByType(thePage, "td");
        for (int k = 0; k < theTd.size(); k++) {
            System.out.println(((Tag) theTd.get(k)).toHTML());
        }
    }
}
相关文章推荐
- Java解析HTML标签
- jsoup 解析html网页标签获取数据(java 网页解析 数据)
- Java解析Html自定义标签的属性
- Jsoup-Java解析Html类库,替换img标签style样式
- java如何用正则解析HTML中img标签里图片的路径
- JAVA解析html标签
- java截取带html标签的字符串并把标签补全(保证格式)
- Java中替换HTML标签的方法代码
- 使用HTMLParser 解析html字符串,去除html标签,提取纯文本
- android Html img 标签解析
- java用正则去除html标签
- java 去html标签,去除字符串中的空格,回车,换行符,制表符
- Java发送Http请求,解析html返回
- JAVA中利用Htmlparse解析HTM…
- java 使用正则表达式过滤HTML中标签
- iOS解析HTML标签
- Java中使用XML标签<x:transform />转化XML为HTML
- 请教:如何通过word解析HTML标签(急哦!)
- 如何在<textarea>标签中使用并解析HTML标签
- Java过滤任意(script,html,style)标签符,返回纯文本--封装类