您的位置:首页 > 运维架构 > 网站架构

Nutch学习笔记13---以某网站为例写解析插件

2014-08-01 11:02 393 查看

编写自己的HtmlParseFilter---sohu

1 进入到文件夹 $NUTCH_HOME/src/plugin

mkdir htmlparsefilter-sohu

2 按照下面的结构建立目录和文件

htmlparsefilter-sohu/

plugin.xml

build.xml

ivy.xml

src/

java/

org/

apache/

nutch/

parse/

HtmlParseFilterSohu.java

3 修改plugin.xml

<?xml version="1.0" encoding="UTF-8"?>

<plugin id="htmlparsefilter-sohu" name="Add Sohu Field to Doc"

version="1.0.0" provider-name="nutch.org">

<runtime>

<library name="htmlparsefilter-sohu.jar">

<export name="*"/>

</library>

</runtime>

<requires>

</requires>

<extension id="org.apache.nutch.parse.parse_sohu"

name="Add sohu Field to doc"

point="org.apache.nutch.parse.HtmlParseFilter">

<implementation id="HtmlParseFilterSohu"

class="org.apache.nutch.parse.HtmlParseFilterSohu"/>

</extension>

</plugin>

4 关于ivy.xml

从plugin/index-basic下面复制对应的ivy.xml,不需要任何改变即可。

5 关于build.xml

修改成以下内容

<?xml version="1.0" encoding="UTF-8"?>

<project name="htmlparsefilter-sohu" default="jar">

<import file="../build-plugin.xml"/>

</project>

6 修改 HtmlParseFilterSohu.java-以实际需求为准-以实际代码为准

7 修改src/plugin/build.xml

找到

<!-- ====================================================== -->

<!-- Build & deploy all the plugin jars. -->

<!-- ====================================================== -->

在下面添加一行

<ant dir="htmlparsefilter-sohu" target="deploy"/>

8 修改nutch-site.xml

我的Nutch运行在local模式下,因此按如下方式修改配置文件local/conf/nutch-site.xml

从nutch-default.xml中复制plugin.includes的配置块

<property>

<name>plugin.includes</name>

<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>

<description> </description>

</property>

到nutch-site.xml中

然后修改复制后的内容

<property>

<name>plugin.includes</name>

<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|htmlparsefilter-sohu</value>

<description> </description>

</property>

就可以了。

9 最后一步-修改$NUTCH_HOME/conf/schema.xml

在<fields>...</fields>段内添加

<field name="pageLength" type="long" stored="true" indexed="true"/>

10 重新ant , 大功告成。

java文件如下:

package org.apache.nutch.parse;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

/**
 * Nutch HTML parse filter that copies video-related fields from a Sohu page
 * into the parse metadata of the page being parsed.
 *
 * <p>The filter only acts on URLs accepted by {@link #isValidUrl(String)}.
 * For those it records the URL itself and then walks the DOM, picking values
 * out of {@code <meta>} tags and out of two specific {@code <div>} blocks
 * (see {@link #walk(Node, Metadata)}). Every value is stored under one of the
 * {@code NUTCH_VIDEO_*} keys; repeated values for the same key are joined with
 * a single space.
 *
 * <p>The filter keeps no per-document state in fields: the metadata being
 * filled is passed explicitly to every helper.
 * NOTE(review): Nutch appears to reuse one filter instance for many documents,
 * possibly from several threads - confirm against the Nutch version in use.
 *
 * <p>Side effect: while extracting the "info info-con" block the filter
 * modifies the DOM it was given (see
 * {@link #extractInfoBlock(Element, Metadata)}).
 */
public class HtmlParseFilterSohu implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(HtmlParseFilterSohu.class);

  /** Runs of whitespace, slashes and parentheses; collapsed to one space in titles. */
  private static final Pattern TITLE_SEPARATORS = Pattern
      .compile("[\\s\\t/()]+");

  private Configuration conf;

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tag / attribute / value names
  // Several of these are not referenced by the current extraction rules.
  private static final String A = "a";
  private static final String ACTOR = "actor";
  private static final String ALBUM = "album";
  private static final String AREABOX = "areabox";
  private static final String CATEGORY = "category";
  private static final String CLASS = "class";
  private static final String CONTENT = "content";
  private static final String CONTENT_LOCATION = "contentLocation";
  private static final String CRUMBS = "crumbs";
  private static final String CRUMBSBAR = "crumbsBar";
  private static final String DATA_SUBSCRIBE_CATEGORYNAME = "data-subscribe-categoryname";
  private static final String DATE_PUBLISHED = "datePublished";
  private static final String DESCRIPTION = "description";
  private static final String DIRECTOR = "director";
  private static final String DIV = "div";
  private static final String DURATION = "duration";
  private static final String FULLDESCRIPTION = "full_desc";
  private static final String GENRE = "genre";
  private static final String H = "h";
  private static final String H2 = "h2";
  private static final String ID = "id";
  private static final String INTRO = "intro";
  private static final String INFO_INFO_CON = "info info-con";
  private static final String IRALBUMNAME = "irAlbumName";
  private static final String IRCATEGORY = "irCategory";
  private static final String IRTITLE = "irTitle";
  private static final String ITEM = "item";
  private static final String ITEMPROP = "itemprop";
  private static final String KEYWORDS = "keywords";
  private static final String LABEL = "label";
  private static final String LI = "li";
  private static final String LINK = "link";
  private static final String MAINACTOR = "mainactor";
  private static final String NAME = "name";
  private static final String OG_IMAGE = "og:image";
  private static final String P = "p";
  private static final String PEOS_INFO = "peos-info";
  private static final String PROPERTY = "property";
  private static final String PUB = "pub";
  private static final String S_H = "s h";
  private static final String SCRIPT = "script";
  private static final String SERIES = "series";
  private static final String SPAN = "span";
  private static final String STYLE = "style";
  private static final String THUMB_NAIL_URL = "thumbnailUrl";
  private static final String TITLE = "title";
  private static final String TYPE = "type";
  private static final String MVINFO = "mvInfo"; // just a mv
  private static final String META = "meta";
  private static final String UL = "ul";

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ url about
  private static final String URL_DOMAIN = ".sohu.com/";
  private static final String URL_SUFFIX = ".shtml";
  private static final String URL_SLASH = "/";

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ parse metadata keys written by this filter
  public static final String NUTCH_VIDEO_FULL_DESCRIPTION = "desc";
  public static final String NUTCH_VIDEO_SERIES = "series";
  public static final String NUTCH_VIDEO_YEAR = "year";
  public static final String NUTCH_VIDEO_AREA = "area";
  public static final String NUTCH_VIDEO_TYPE = "type";
  public static final String NUTCH_VIDEO_DIRECTOR = "director";
  public static final String NUTCH_VIDEO_ACTOR = "actor";
  public static final String NUTCH_VIDEO_TITLE = "title";
  public static final String NUTCH_VIDEO_CHANNEL = "channel";
  public static final String NUTCH_VIDEO_KEYWORD = "keyword";
  public static final String NUTCH_VIDEO_URL = "url";
  public static final String NUTCH_VIDEO_PICTURE = "picture";
  public static final String NUTCH_VIDEO_WEBSITE = "website";
  public static final String NUTCH_VIDEO_TIMESPAN = "timespan";

  // ------------------------------------------------------------ URL checks

  /**
   * Returns true when every character of {@code segment} is an ASCII digit.
   * An empty string yields true; callers check the length themselves.
   */
  private boolean isNumber(String segment) {
    for (int i = 0; i < segment.length(); i++) {
      char c = segment.charAt(i);
      if (c < '0' || c > '9') {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true when every character of {@code segment} is an ASCII letter
   * or digit. An empty string yields true.
   */
  private boolean isValidStr(String segment) {
    for (int i = 0; i < segment.length(); i++) {
      char c = segment.charAt(i);
      boolean alphanumeric = (c >= 'a' && c <= 'z')
          || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
      if (!alphanumeric) {
        return false;
      }
    }
    return true;
  }

  /**
   * Decides whether {@code url} is one this filter should process.
   *
   * <p>Accepted shape: {@code ....sohu.com/<8 digits>/<alphanumerics>.shtml...}
   * - the first path segment after ".sohu.com/" must be exactly eight digits,
   * and the text between the following slash and the next ".shtml" must be
   * purely alphanumeric. Anything after ".shtml" is not inspected.
   *
   * @param url URL of the page; may be null
   * @return true if the URL matches the shape above, false otherwise
   */
  private boolean isValidUrl(String url) {
    if (null == url) {
      return false;
    }

    int domainAt = url.indexOf(URL_DOMAIN);
    if (-1 == domainAt) {
      return false;
    }
    int dateStart = domainAt + URL_DOMAIN.length();

    int dateEnd = url.indexOf(URL_SLASH, dateStart);
    if (-1 == dateEnd || 8 != dateEnd - dateStart) {
      return false;
    }
    if (!isNumber(url.substring(dateStart, dateEnd))) {
      return false;
    }

    int nameStart = dateEnd + URL_SLASH.length();
    int nameEnd = url.indexOf(URL_SUFFIX, nameStart);
    if (-1 == nameEnd) {
      return false;
    }
    return isValidStr(url.substring(nameStart, nameEnd));
  }

  // ------------------------------------------------------- metadata access

  /**
   * Looks up the parse metadata stored for {@code url} in {@code parseResult}.
   *
   * @return the metadata, or null when there is no parse, no parse data or no
   *         metadata for that URL
   */
  private Metadata resolveParseMeta(ParseResult parseResult, String url) {
    if (null == parseResult.get(url)) {
      return null;
    }
    ParseData parseData = parseResult.get(url).getData();
    if (null == parseData) {
      return null;
    }
    return parseData.getParseMeta();
  }

  /**
   * Stores {@code value} under {@code key}. If the key already holds a value
   * the new one is appended after a single space, so pages with several
   * matching blocks accumulate all of them. The resulting value is logged.
   */
  private void appendKeyValue(Metadata metadata, String key, String value) {
    String oldValue = metadata.get(key);
    if (null == oldValue) {
      metadata.set(key, value);
    } else {
      metadata.set(key, oldValue + " " + value);
    }
    LOG.info("[" + key + "]   [" + metadata.get(key) + "]");
  }

  /** Calls {@link #appendKeyValue} unless {@code value} is null or empty. */
  private void appendIfPresent(Metadata metadata, String key, String value) {
    if (null != value && value.length() > 0) {
      appendKeyValue(metadata, key, value);
    }
  }

  // ------------------------------------------------------------ DOM lookup

  /**
   * Case-insensitive tag-name test. {@code equalsIgnoreCase} is used instead
   * of lower-casing so the result does not depend on the default locale.
   */
  private boolean hasTag(Element element, String tagName) {
    String actual = element.getTagName();
    return null != actual && actual.equalsIgnoreCase(tagName);
  }

  /** Exact (untrimmed, case-sensitive) attribute-value test. */
  private boolean hasAttributeValue(Element element, String attributeName,
      String attributeValue) {
    String actual = element.getAttribute(attributeName);
    return null != actual && actual.equals(attributeValue);
  }

  /**
   * Returns the first direct child element of {@code parent} whose tag is
   * {@code tagName} (case-insensitive), or null. {@code parent} itself is
   * not considered.
   */
  private Element getFirstChildNodeWithTagName(Element parent, String tagName) {
    if (null == parent) {
      return null;
    }
    NodeList nodeList = parent.getChildNodes();
    for (int index = 0; index < nodeList.getLength(); index++) {
      Node child = nodeList.item(index);
      if (child instanceof Element && hasTag((Element) child, tagName)) {
        return (Element) child;
      }
    }
    return null;
  }

  /**
   * Like {@link #getFirstChildNodeWithTagName(Element, String)} but the child
   * must additionally carry exactly {@code n} attributes.
   */
  private Element getFirstChildNodeWithTagNameFixedAttributeLength(
      Element parent, String tagName, int n) {
    if (null == parent) {
      return null;
    }
    NodeList nodeList = parent.getChildNodes();
    for (int index = 0; index < nodeList.getLength(); index++) {
      Node child = nodeList.item(index);
      if (child instanceof Element) {
        Element childElement = (Element) child;
        if (hasTag(childElement, tagName)
            && childElement.getAttributes().getLength() == n) {
          return childElement;
        }
      }
    }
    return null;
  }

  /**
   * Returns the first direct child element of {@code parent} whose attribute
   * {@code attributeName}, after trimming, equals {@code attributeValue}; or
   * null. {@code parent} itself is not considered.
   */
  private Element getFirstChildNodeWithAttribute(Element parent,
      String attributeName, String attributeValue) {
    if (null == parent) {
      return null;
    }
    NodeList nodeList = parent.getChildNodes();
    for (int index = 0; index < nodeList.getLength(); index++) {
      Node child = nodeList.item(index);
      if (child instanceof Element) {
        Element childElement = (Element) child;
        String value = childElement.getAttribute(attributeName);
        if (null != value && value.trim().equals(attributeValue)) {
          return childElement;
        }
      }
    }
    return null;
  }

  /**
   * Depth-first, document-order search (starting with {@code parent} itself)
   * for the first element whose attribute equals {@code attributeValue}.
   *
   * @return the matching element or null
   */
  private Element getFirstDescendantWithAttribute(Node parent,
      String attributeName, String attributeValue) {
    if (null == parent) {
      return null;
    }
    if (parent instanceof Element
        && hasAttributeValue((Element) parent, attributeName, attributeValue)) {
      return (Element) parent;
    }
    NodeList children = parent.getChildNodes();
    for (int index = 0; index < children.getLength(); index++) {
      Element found = getFirstDescendantWithAttribute(children.item(index),
          attributeName, attributeValue);
      if (null != found) {
        return found;
      }
    }
    return null;
  }

  /**
   * Depth-first, document-order search (starting with {@code parent} itself)
   * for the first element with the given tag (case-insensitive).
   *
   * @return the matching element or null
   */
  private Element getFirstDescendantWithTag(Node parent, String tagName) {
    if (null == parent) {
      return null;
    }
    if (parent instanceof Element && hasTag((Element) parent, tagName)) {
      return (Element) parent;
    }
    NodeList children = parent.getChildNodes();
    for (int index = 0; index < children.getLength(); index++) {
      Element found = getFirstDescendantWithTag(children.item(index), tagName);
      if (null != found) {
        return found;
      }
    }
    return null;
  }

  /**
   * Depth-first, document-order search (starting with {@code parent} itself)
   * for the first element that has both the given tag (case-insensitive) and
   * the given exact attribute value.
   *
   * @return the matching element or null
   */
  private Element getFirstDescendantWithTagPlusAttribute(Node parent,
      String tagName, String attributeName, String attributeValue) {
    if (null == parent) {
      return null;
    }
    if (parent instanceof Element) {
      Element element = (Element) parent;
      if (hasTag(element, tagName)
          && hasAttributeValue(element, attributeName, attributeValue)) {
        return element;
      }
    }
    NodeList children = parent.getChildNodes();
    for (int index = 0; index < children.getLength(); index++) {
      Element found = getFirstDescendantWithTagPlusAttribute(
          children.item(index), tagName, attributeName, attributeValue);
      if (null != found) {
        return found;
      }
    }
    return null;
  }

  /**
   * Like {@link #getFirstDescendantWithTagPlusAttribute} but the element must
   * additionally carry exactly {@code n} attributes.
   *
   * <p>Two defects of the earlier version are fixed here: the attribute map
   * was dereferenced only when it was null, and the recursion delegated to
   * the variant without the attribute-count constraint, so {@code n} was
   * ignored below the first level.
   *
   * @return the matching element or null
   */
  private Element getFirstDescendantWithTagPlusAttributeFixedAttribute(
      Node parent, String tagName, String attributeName,
      String attributeValue, int n) {
    if (null == parent) {
      return null;
    }
    if (parent instanceof Element) {
      Element element = (Element) parent;
      if (hasTag(element, tagName)
          && hasAttributeValue(element, attributeName, attributeValue)
          && null != element.getAttributes()
          && element.getAttributes().getLength() == n) {
        return element;
      }
    }
    NodeList children = parent.getChildNodes();
    for (int index = 0; index < children.getLength(); index++) {
      Element found = getFirstDescendantWithTagPlusAttributeFixedAttribute(
          children.item(index), tagName, attributeName, attributeValue, n);
      if (null != found) {
        return found;
      }
    }
    return null;
  }

  /**
   * Returns the text content of the first direct {@code <a>} child of
   * {@code li}, or null when {@code li} is null or has no such child.
   */
  private String firstLinkText(Element li) {
    Element a = getFirstChildNodeWithTagName(li, A);
    return null == a ? null : a.getTextContent();
  }

  // ------------------------------------------------------------- DOM walk

  /**
   * Recursively visits {@code node} and records whatever the extraction rules
   * find into {@code metadata}.
   *
   * <ul>
   * <li>document fragments: children are visited;</li>
   * <li>elements: see {@link #walkElement(Element, Metadata)};</li>
   * <li>text and comment nodes: ignored;</li>
   * <li>any other node type: logged and ignored.</li>
   * </ul>
   */
  private void walk(Node node, Metadata metadata) {
    if (null == node) {
      return;
    }
    switch (node.getNodeType()) {
    case Node.DOCUMENT_FRAGMENT_NODE:
      walkChildren(node, metadata);
      break;
    case Node.ELEMENT_NODE:
      walkElement((Element) node, metadata);
      break;
    case Node.TEXT_NODE:
    case Node.COMMENT_NODE:
      break;
    default:
      LOG.info("xxx-type-not-parsed------" + node.getNodeName());
      break;
    }
  }

  /** Visits every child of {@code node} in order. */
  private void walkChildren(Node node, Metadata metadata) {
    NodeList children = node.getChildNodes();
    for (int i = 0; children != null && i < children.getLength(); i++) {
      walk(children.item(i), metadata);
    }
  }

  /**
   * Applies the per-tag rules to one element.
   *
   * <p>{@code script}, {@code style} and {@code link} subtrees are skipped.
   * {@code meta} elements are inspected and then descended into like any
   * other element. A {@code div} that matches one of the two known blocks is
   * fully handled by its extractor and NOT descended into; every other
   * element has its children visited.
   */
  private void walkElement(Element element, Metadata metadata) {
    String tag = element.getTagName();
    if (null == tag) {
      return;
    }
    if (SCRIPT.equalsIgnoreCase(tag) || STYLE.equalsIgnoreCase(tag)
        || LINK.equalsIgnoreCase(tag)) {
      return;
    }
    if (META.equalsIgnoreCase(tag)) {
      extractMeta(element, metadata);
    } else if (DIV.equalsIgnoreCase(tag) && extractDiv(element, metadata)) {
      return;
    }
    walkChildren(element, metadata);
  }

  /**
   * Reads a {@code <meta>} element: {@code name="album"} feeds the series,
   * {@code name="category"} feeds the channel, and independently
   * {@code property="og:image"} feeds the picture. The value always comes
   * from the {@code content} attribute and is skipped when empty.
   */
  private void extractMeta(Element meta, Metadata metadata) {
    String name = meta.getAttribute(NAME);
    if (ALBUM.equals(name)) {
      appendIfPresent(metadata, NUTCH_VIDEO_SERIES, meta.getAttribute(CONTENT));
    } else if (CATEGORY.equals(name)) {
      appendIfPresent(metadata, NUTCH_VIDEO_CHANNEL,
          meta.getAttribute(CONTENT));
    }

    if (OG_IMAGE.equals(meta.getAttribute(PROPERTY))) {
      appendIfPresent(metadata, NUTCH_VIDEO_PICTURE,
          meta.getAttribute(CONTENT));
    }
  }

  /**
   * Handles the two recognised {@code <div>} blocks: {@code id="crumbsBar"}
   * (title) and {@code class="info info-con"} (details).
   *
   * @return true when the div was one of those blocks and has been handled,
   *         false when the caller should keep descending
   */
  private boolean extractDiv(Element div, Metadata metadata) {
    if (CRUMBSBAR.equals(div.getAttribute(ID))) {
      extractTitle(div, metadata);
      return true;
    }
    if (INFO_INFO_CON.equals(div.getAttribute(CLASS))) {
      extractInfoBlock(div, metadata);
      return true;
    }
    return false;
  }

  /**
   * Takes the text of the first {@code <h2>} under {@code crumbsBar},
   * collapses whitespace, slashes and parentheses to single spaces, and
   * records the non-empty result as the title.
   */
  private void extractTitle(Element crumbsBar, Metadata metadata) {
    Element h2 = getFirstDescendantWithTag(crumbsBar, H2);
    if (null == h2) {
      return;
    }
    String text = h2.getTextContent();
    if (null == text || text.length() == 0) {
      return;
    }
    text = TITLE_SEPARATORS.matcher(text).replaceAll(" ").trim();
    if (text.length() > 0) {
      appendKeyValue(metadata, NUTCH_VIDEO_TITLE, text);
    }
  }

  /**
   * Extracts actors, year, area, type and description from the
   * {@code "info info-con"} div.
   *
   * <p>This method deliberately keeps the DOM edits of the earlier version,
   * because later lookups in the same block run against the edited tree:
   * actor links are removed as they are consumed, the {@code class} attribute
   * of the area item is removed before the type item is searched for, and the
   * {@code <em>} child of the intro paragraph is removed before its text is
   * read.
   * NOTE(review): the class removal presumably stops the area item from being
   * picked up as the "s h" type item - confirm against a live page before
   * making this non-destructive.
   */
  private void extractInfoBlock(Element info, Metadata metadata) {
    // Actors: every direct <a> under <li id="mainactor">, consumed one by one.
    Element mainactor = getFirstDescendantWithTagPlusAttribute(info, LI, ID,
        MAINACTOR);
    if (null != mainactor) {
      Element actor = getFirstChildNodeWithTagName(mainactor, A);
      while (null != actor) {
        appendIfPresent(metadata, NUTCH_VIDEO_ACTOR, actor.getTextContent());
        mainactor.removeChild(actor);
        actor = getFirstChildNodeWithTagName(mainactor, A);
      }
    }

    // Year: first link of <li class="h">, accepted only if it is 4 digits.
    String year = firstLinkText(getFirstDescendantWithTagPlusAttribute(info,
        LI, CLASS, H));
    if (null != year) {
      String trimmedYear = year.trim();
      if (trimmedYear.length() == 4 && isNumber(trimmedYear)) {
        appendKeyValue(metadata, NUTCH_VIDEO_YEAR, trimmedYear);
      }
    }

    // Area: first link of <li id="areabox">; its class is then dropped.
    Element areaItem = getFirstDescendantWithTagPlusAttribute(info, LI, ID,
        AREABOX);
    if (null != areaItem) {
      appendIfPresent(metadata, NUTCH_VIDEO_AREA, firstLinkText(areaItem));
      areaItem.removeAttribute(CLASS);
    }

    // Type: first link of the first remaining <li class="s h">.
    appendIfPresent(metadata, NUTCH_VIDEO_TYPE,
        firstLinkText(getFirstDescendantWithTagPlusAttribute(info, LI, CLASS,
            S_H)));

    // Description: text of <p class="intro"> without its leading <em>.
    Element intro = getFirstDescendantWithTagPlusAttribute(info, P, CLASS,
        INTRO);
    if (null != intro) {
      Element em = getFirstChildNodeWithTagName(intro, "em");
      if (null != em) {
        intro.removeChild(em);
      }
      appendIfPresent(metadata, NUTCH_VIDEO_FULL_DESCRIPTION,
          intro.getTextContent());
    }
  }

  // ------------------------------------------------------------ entry point

  /**
   * Adds Sohu video fields to the parse metadata of {@code content}'s URL.
   *
   * <p>The parse result is returned untouched when any argument is null, when
   * the URL is not accepted by {@link #isValidUrl(String)}, or when
   * {@code parseResult} holds no parse metadata for the URL.
   *
   * @param content     fetched content; only its URL is used
   * @param parseResult parse result whose metadata is extended in place
   * @param metaTags    must be non-null but is otherwise unused
   * @param doc         DOM of the page; may be modified (see class comment)
   * @return the same {@code parseResult} instance that was passed in
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    LOG.info("begin**********HtmlParseFilterSohu************by Sohu");
    if (null == content || null == parseResult || null == metaTags
        || null == doc) {
      LOG.info("content|parseResult|metaTags|doc is null,so just return parseResult...");
      return parseResult;
    }
    LOG.info("four params checked ok,handle next......");

    String url = content.getUrl();
    LOG.info("currenturl is ------" + url);
    if (!isValidUrl(url)) {
      LOG.info("invalid url,just return raw parseResult...");
      return parseResult;
    }
    LOG.info("video[source]---" + "valid url,sohustatics...");

    // Resolve the target metadata once; without it nothing can be recorded.
    Metadata metadata = resolveParseMeta(parseResult, url);
    if (null == metadata) {
      LOG.info("no parse metadata for url,just return raw parseResult...");
      return parseResult;
    }

    appendKeyValue(metadata, NUTCH_VIDEO_URL, url);
    walk(doc, metadata);

    LOG.info("end**************************************end");
    return parseResult;
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return conf;
  }

  /** Stores the configuration; this filter does not read any setting from it. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  nutch sohu