Java网络爬虫crawler4j学习笔记<20> 网页内容转码解析
2016-11-10 14:20
525 查看
简介
网页内容解析相关的类和接口位于包edu.uci.ics.crawler4j.parser中,用于拆分解析html网页的各部分内容。这些Parser的基本作用是从各种各样的数据(二进制、文本)中抽取出我们需要的html页面。下面给出相关源代码。
ParseData接口
ParseData 接口包含getOutgoingUrls方法,用于获取当前页面的所有外链。

package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;

import java.util.Set;

/**
 * Common contract for the result of parsing a fetched page.
 *
 * <p>Every parse result, regardless of content type, exposes the set of
 * outgoing links discovered in the page.
 */
public interface ParseData {

    /** Returns all outgoing links found on the current page. */
    Set<WebURL> getOutgoingUrls();

    /** Replaces the set of outgoing links for the current page. */
    void setOutgoingUrls(Set<WebURL> outgoingUrls);

    /** Implementations must provide a human-readable rendering of the parsed content. */
    @Override
    String toString();
}
HtmlParseData类
package edu.uci.ics.crawler4j.parser; import edu.uci.ics.crawler4j.url.WebURL; import java.util.Map; import java.util.Set; // 用于处理html页面的parse类 public class HtmlParseData implements ParseData { private String html; private String text; private String title; private Map<String, String> metaTags; private Set<WebURL> outgoingUrls; public String getHtml() { return html; } public void setHtml(String html) { this.html = html; } public String getText() { return text; } public void setText(String text) { this.text = text; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public Map<String, String> getMetaTags() { return metaTags; } public void setMetaTags(Map<String, String> metaTags) { this.metaTags = metaTags; } @Override public Set<WebURL> getOutgoingUrls() { return outgoingUrls; } @Override public void setOutgoingUrls(Set<WebURL> outgoingUrls) { this.outgoingUrls = outgoingUrls; } @Override public String toString() { return text; } }
TextParseData类
package edu.uci.ics.crawler4j.parser; import edu.uci.ics.crawler4j.url.WebURL; import java.util.HashSet; import java.util.Set; //对文本数据进行parse的类 public class TextParseData implements ParseData { private String textContent; private Set<WebURL> outgoingUrls = new HashSet<>(); public String getTextContent() { return textContent; } public void setTextContent(String textContent) { this.textContent = textContent; } @Override public Set<WebURL> getOutgoingUrls() { return outgoingUrls; } @Override public void setOutgoingUrls(Set<WebURL> outgoingUrls) { this.outgoingUrls = outgoingUrls; } @Override public String toString() { return textContent; } }
BinaryParseData类
package edu.uci.ics.crawler4j.parser; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import edu.uci.ics.crawler4j.url.WebURL; // Tika是一个内容分析工具,自带全面的parser工具类, // 能解析基本所有常见格式的文件,得到文件的metadata,content等内容,返回格式化信息 import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; //二进制文件解析类 public class BinaryParseData implements ParseData { private static final Logger logger = LoggerFactory.getLogger(BinaryParseData.class); private static final String DEFAULT_ENCODING = "UTF-8"; // 默认编码utf-8 private static final String DEFAULT_OUTPUT_FORMAT = "html"; //默认输出格式html // Creates an auto-detecting parser instance using the default Tika configuration. private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser(); private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); // Parse context. Used to pass context information to Tika parsers. 
private final ParseContext context = new ParseContext(); // 页面的所有外链 private Set<WebURL> outgoingUrls = new HashSet<>(); // 从二进制数据中得到的html内容 private String html = null; public BinaryParseData() { context.set(Parser.class, AUTO_DETECT_PARSER); } public void setBinaryContent(byte[] data) { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType String htmlContent = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace("http://www.w3.org/1999/xhtml", ""); setHtml(htmlContent); } catch (Exception e) { logger.error("Error parsing file", e); } } /** * Returns a transformer handler that serializes incoming SAX events to * XHTML or HTML (depending the given method) using the given output encoding. 
* * @param encoding output encoding, or <code>null</code> for the platform default */ private static TransformerHandler getTransformerHandler(OutputStream out, String method, String encoding) throws TransformerConfigurationException { TransformerHandler transformerHandler = SAX_TRANSFORMER_FACTORY.newTransformerHandler(); Transformer transformer = transformerHandler.getTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, method); // html // the Transformer may add additional whitespace when outputting the result tree; transformer.setOutputProperty(OutputKeys.INDENT, "yes"); if (encoding != null) { transformer.setOutputProperty(OutputKeys.ENCODING, encoding); } transformerHandler.setResult(new StreamResult(new PrintStream(out))); return transformerHandler; } /** @return Parsed binary content or null */ public String getHtml() { return html; } public void setHtml(String html) { this.html = html; } @Override public Set<WebURL> getOutgoingUrls() { return outgoingUrls; } @Override public void setOutgoingUrls(Set<WebURL> outgoingUrls) { this.outgoingUrls = outgoingUrls; } @Override public String toString() { return (html == null || html.isEmpty()) ? "No data parsed yet" : getHtml(); } }
相关文章推荐
- Java网络爬虫crawler4j学习笔记<19> SAX解析工具类
- java爬虫爬取网页内容前,对网页内容的编码格式进行判断的方式
- Java-->Json解析网页数据
- Java网络爬虫crawler4j学习笔记<1>入门
- Java网络爬虫crawler4j学习笔记<2> Util类
- Java网络爬虫crawler4j学习笔记<3> IO类
- Java网络爬虫crawler4j学习笔记<4> Net类
- Java网络爬虫crawler4j学习笔记<5> TLDList类
- Java网络爬虫crawler4j学习笔记<7> UrlResolver类
- Java网络爬虫crawler4j学习笔记<8> URLCanonicalizer类
- Java网络爬虫crawler4j学习笔记<9> RuleSet类
- Java网络爬虫crawler4j学习笔记<12> RobotstxtParser类
- Java网络爬虫crawler4j学习笔记<13> AuthInfo类
- Java网络爬虫crawler4j学习笔记<14> BasicAuthInfo类
- Java网络爬虫crawler4j学习笔记<15> FormAuthInfo类
- Java网络爬虫crawler4j学习笔记<16> exceptions
- Java网络爬虫crawler4j学习笔记<17> CrawlConfig类
- Java网络爬虫crawler4j学习笔记<18> Configurable类
- Java网络爬虫crawler4j学习笔记<21> Page 类
- Java网络爬虫crawler4j学习笔记<22> Parser 类