博客首页博文截取列表
2009-08-31 22:17
260 查看
1. 在 SQL 语句中使用 substring() 函数,只取出博文内容字段的前一部分;
2. 使用 htmlparser 处理截取到的内容(去除 script 标签、限制图片的宽和高);
3. 使用 htmlparser 补齐因截取而缺失的结束标签。
代码如下:
package cn.blog.parser;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
public class ParserTest {
private static final Logger logger = Logger.getLogger(Parser.class);
public static String readHtml(String url) {
try {
Parser parser = new Parser();
parser.setURL(url);
parser.setEncoding(parser.getEncoding());
NodeVisitor visitor = new NodeVisitor() {
public void visitTag(Tag tag) {
logger.fatal("testVisitorAll() Tag name is :"
+ tag.getTagName() + " /n Class is :"
+ tag.getClass());
}
};
parser.visitAllNodesWith(visitor);
} catch (ParserException e) {
e.printStackTrace();
}
return "";
}
public static String readWithTag(String url, int len) throws IOException {
// java.io.FileReader fred = new FileReader(url);
FileInputStream fin = new FileInputStream(url);
byte[] bb = new byte[fin.available()];
fin.read(bb);
String content = new String(bb);
fin.close();
return content.substring(0, content.length() > len ? len : content
.length());
}
public static void writeWithTag(String content, String url)
throws IOException {
FileOutputStream fou = new FileOutputStream(url);
byte[] bb = content.getBytes();
fou.write(bb);
fou.close();
}
public static String subcontent(String content) {
try {
content = endTagValidate(content);
Parser parser = Parser.createParser(content, "utf-8");
NodeList list = parser.parse(null);
//处理页面内容
visitNodeList(list);
content = list.toHtml();
//补齐标签
return addEndTag(content);
} catch (Exception e) {
return "";
}
}
//处理截取完后的最后标签
private static String endTagValidate(String content)
{
int end = content.lastIndexOf("<");
String ss = content.substring(end, content.length()).toLowerCase();
//"" 替换成视频标签
if (ss.startsWith("img") || ss.startsWith(""))
{
if (ss.indexOf(">") == -1) {
content = content.substring(0, end);
}
}
return content;
}
/**
* 补齐标签的结尾
* @param html
* @return
* @throws UnsupportedEncodingException
* @throws ParserException
*/
private static String addEndTag(String html)
throws UnsupportedEncodingException, ParserException {
Parser parser = Parser.createParser(html, "utf-8");
NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
public boolean accept(Node node) {
if (node instanceof CompositeTag) {
return true;
}
return false;
}
});
String str = "";
String tmp = "";
for (int i = 0; i < nodelist.size(); i++) {
CompositeTag testTag = (CompositeTag) nodelist.elementAt(i);
if (testTag.getParent() == null) {
// 记住这里只需循环第一层就能帮你补齐的了
tmp = testTag.toHtml();
str += tmp + "/n";
}
}
return str + "...";
}
/**
* 处理html的内容 (去除script、 将图片的高和宽限制了)
*
* @param list
*/
private static void visitNodeList(NodeList list) {
for (int i = 0; i < list.size(); i++) {
Node node = list.elementAt(i);
if (node instanceof ScriptTag) {
list.remove(i);
continue;
}
if (node instanceof ImageTag) {
ImageTag img = (ImageTag) node;
//限制图片的长宽
img.setAttribute("width", "/"100/"");
img.setAttribute("height", "/"75/"");
}
NodeList children = node.getChildren();
if (children != null && children.size() > 0) {
visitNodeList(children);
}
}
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
// readHtml("http://www.google.com");
try {
// String content = readWithTag("D://htmlParser//y1.txt", 40000);
String content = "<html><head><title>asdf</title></head><body>nihaodcesljk啊是的拉快点放假啊两节课";
System.out.println(content);
System.out.println(subcontent(content));
// writeWithTag(subcontent(content),
// "D://htmlParser//1.html");
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
2. 使用 htmlparser 处理截取到的内容(去除 script 标签、限制图片的宽和高);
3. 使用 htmlparser 补齐因截取而缺失的结束标签。
代码如下:
package cn.blog.parser;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
public class ParserTest {
private static final Logger logger = Logger.getLogger(Parser.class);
public static String readHtml(String url) {
try {
Parser parser = new Parser();
parser.setURL(url);
parser.setEncoding(parser.getEncoding());
NodeVisitor visitor = new NodeVisitor() {
public void visitTag(Tag tag) {
logger.fatal("testVisitorAll() Tag name is :"
+ tag.getTagName() + " /n Class is :"
+ tag.getClass());
}
};
parser.visitAllNodesWith(visitor);
} catch (ParserException e) {
e.printStackTrace();
}
return "";
}
public static String readWithTag(String url, int len) throws IOException {
// java.io.FileReader fred = new FileReader(url);
FileInputStream fin = new FileInputStream(url);
byte[] bb = new byte[fin.available()];
fin.read(bb);
String content = new String(bb);
fin.close();
return content.substring(0, content.length() > len ? len : content
.length());
}
public static void writeWithTag(String content, String url)
throws IOException {
FileOutputStream fou = new FileOutputStream(url);
byte[] bb = content.getBytes();
fou.write(bb);
fou.close();
}
public static String subcontent(String content) {
try {
content = endTagValidate(content);
Parser parser = Parser.createParser(content, "utf-8");
NodeList list = parser.parse(null);
//处理页面内容
visitNodeList(list);
content = list.toHtml();
//补齐标签
return addEndTag(content);
} catch (Exception e) {
return "";
}
}
//处理截取完后的最后标签
private static String endTagValidate(String content)
{
int end = content.lastIndexOf("<");
String ss = content.substring(end, content.length()).toLowerCase();
//"" 替换成视频标签
if (ss.startsWith("img") || ss.startsWith(""))
{
if (ss.indexOf(">") == -1) {
content = content.substring(0, end);
}
}
return content;
}
/**
* 补齐标签的结尾
* @param html
* @return
* @throws UnsupportedEncodingException
* @throws ParserException
*/
private static String addEndTag(String html)
throws UnsupportedEncodingException, ParserException {
Parser parser = Parser.createParser(html, "utf-8");
NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
public boolean accept(Node node) {
if (node instanceof CompositeTag) {
return true;
}
return false;
}
});
String str = "";
String tmp = "";
for (int i = 0; i < nodelist.size(); i++) {
CompositeTag testTag = (CompositeTag) nodelist.elementAt(i);
if (testTag.getParent() == null) {
// 记住这里只需循环第一层就能帮你补齐的了
tmp = testTag.toHtml();
str += tmp + "/n";
}
}
return str + "...";
}
/**
* 处理html的内容 (去除script、 将图片的高和宽限制了)
*
* @param list
*/
private static void visitNodeList(NodeList list) {
for (int i = 0; i < list.size(); i++) {
Node node = list.elementAt(i);
if (node instanceof ScriptTag) {
list.remove(i);
continue;
}
if (node instanceof ImageTag) {
ImageTag img = (ImageTag) node;
//限制图片的长宽
img.setAttribute("width", "/"100/"");
img.setAttribute("height", "/"75/"");
}
NodeList children = node.getChildren();
if (children != null && children.size() > 0) {
visitNodeList(children);
}
}
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
// readHtml("http://www.google.com");
try {
// String content = readWithTag("D://htmlParser//y1.txt", 40000);
String content = "<html><head><title>asdf</title></head><body>nihaodcesljk啊是的拉快点放假啊两节课";
System.out.println(content);
System.out.println(subcontent(content));
// writeWithTag(subcontent(content),
// "D://htmlParser//1.html");
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
相关文章推荐
- 阿里的博客 http://blog.sina.com.cn/haodengli [订阅][手机订阅] 首页博文目录图片关于我 个人资料 阿里 阿里 微博 加好友发纸条 写留言加关注 博客十周年地图 博
- 怎样控制wordpress博客首页博文显示内容字数!
- 博客首页及分类等处文章列表中显示文章的方式更改为文章摘要
- 博客更改地址后在home首页访问不了以前的博文,以此推想博客园的数据库设计
- 让wordpress博客首页、分类页 显示文章标题列表或摘要
- Blog首页增加作者列表 [ 光影人像 东海陈光剑 的博客 ]
- 从零开始,做一个NodeJS博客(二):实现首页-加载文章列表和详情
- 让wordpress博客首页、分类页 显示文章标题列表或摘要
- 【Android 我的博客APP】1.抓取博客首页文章列表内容——网页数据抓取
- 什么样的博文才能上首页呢?『博客使用技巧』
- 被推荐上博客首页的博文是怎样炼成的
- Python实现抓取CSDN博客首页文章列表
- 博客园首页的博客列表 与 自己首页的排名为什么不一致?
- 很高兴我的博文《Spring搭配Ehcache实例解析 》被推荐到CSDN博客首页啦
- Python实现抓取CSDN博客首页文章列表
- 老男孩51CTO博客博文列表整理版20170620更新
- python抓取CSDN博客首页的所有博文,对标题分词存入mongodb中
- 我的博客网站开发4——博客首页功能实现之博文摘要
- 在自己的豆瓣首页上增加最近的博客内容