您的位置:首页 > 其它

Nutch1.2增加插件例子

2013-09-05 16:11 393 查看
今尝试下给nutch1.2增加一个插件,于是到官网找了个例子,链接如下:

http://wiki.apache.org/nutch/WritingPluginExample-0.9

这个例子实现的是推荐网站:把关键字写在meta标签的content里,当别人搜索这个关键字时,你推荐的网站会在搜索结果中排在前面。要实现推荐,必须在你的网页上加上

[xhtml] view
plaincopy

<meta name="recommended" content="plugins" />

这条属性才能被插件识别。

由于它这个例子是用nutch0.9的,而且1.2和0.9有些区别,于是要修改一些代码。步骤如下:

1.插件开发

1.1在src/plugin中新建一个文件夹recommended

1.2.在recommended目录下新建plugin.xml和build.xml文件(文件名必须小写),内容如下:

plugin.xml

[xhtml] view
plaincopy

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Plugin manifest for the "recommended" plugin. It declares the plugin's jar
  and registers three extensions: an HTML parse filter, an indexing filter and
  a search query filter, all in package org.apache.nutch.parse.recommended.
-->
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- The RecommendedIndexer extends the IndexingFilter in order to add the contents
        of the recommended meta tags (as found by the RecommendedParser) to the lucene
        index. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter gets called when you perform a search. It runs a
        search for the user's query against the recommended fields. In order to
        add this to the list of filters that gets run by default, you have to use
        "fields=DEFAULT". -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
      </implementation>
   </extension>

</plugin>

build.xml

[xhtml] view
plaincopy

<?xml version="1.0"?>

<!-- Ant build file for the "recommended" plugin; the default target is jar-core. -->

<project name="recommended" default="jar-core">

<!-- Pull in the build definitions shared by plugins from the parent directory -->

<import file="../build-plugin.xml"/>

<!-- Build compilation dependencies: runs the "jar" target of the sibling lib-xml project -->

<target name="deps-jar">

<ant target="jar" inheritall="false" dir="../lib-xml"/>

</target>

<!-- Add compilation dependencies to classpath: any lib-xml jar found under the Nutch build directory -->

<path id="plugin.deps">

<fileset dir="${nutch.root}/build">

<include name="**/lib-xml/*.jar" />

</fileset>

</path>

<!-- Deploy Unit test dependencies: runs the "deploy" target of each sibling project listed below -->

<target name="deps-test">

<ant target="deploy" inheritall="false" dir="../lib-xml"/>

<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>

<ant target="deploy" inheritall="false" dir="../protocol-file"/>

</target>

<!-- For the junit test: these two tasks sit outside any target. They create the
     test data directory and copy the sample page data/recommended.html into it. -->

<mkdir dir="${build.test}/data"/>

<copy file="data/recommended.html" todir="${build.test}/data"/>

</project>

1.3.在recommended目录下建立/src/java/org/apache/nutch/parse/recommended目录。

1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类,内容如下:

RecommendedIndexer.java

[java] view
plaincopy

package org.apache.nutch.parse.recommended;

// JDK import

import java.util.logging.Logger;

// Commons imports

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

// Nutch imports

import org.apache.nutch.util.LogUtil;

import org.apache.nutch.fetcher.FetcherOutput;

import org.apache.nutch.indexer.IndexingFilter;

import org.apache.nutch.indexer.IndexingException;

import org.apache.nutch.indexer.NutchDocument;

import org.apache.nutch.parse.Parse;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.io.Text;

import org.apache.nutch.crawl.CrawlDatum;

import org.apache.nutch.crawl.Inlinks;

// Lucene imports

import org.apache.lucene.document.Field;

import org.apache.lucene.document.Document;

public class RecommendedIndexer implements IndexingFilter {

public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());

private Configuration conf;

public RecommendedIndexer() {

}

@Override

public NutchDocument filter(NutchDocument doc, Parse parse, Text url,

CrawlDatum datum, Inlinks inlinks)

throws IndexingException {

String recommendation = parse.getData().getMeta("recommended");

if (recommendation != null) {

Field recommendedField =

new Field("recommended", recommendation,

Field.Store.YES, Field.Index.NOT_ANALYZED);

recommendedField.setBoost(5.0f);

doc.add("recommended",recommendedField);

LOG.info("Added " + recommendation + " to the recommended Field");

}

return doc;

}

public void setConf(Configuration conf) {

this.conf = conf;

}

public Configuration getConf() {

return this.conf;

}

@Override

public void addIndexBackendOptions(Configuration conf) {

// TODO Auto-generated method stub

}

}

RecommendedParser.java

[java] view
plaincopy

package org.apache.nutch.parse.recommended;

// JDK imports

import java.util.Enumeration;

import java.util.Properties;

import java.util.logging.Logger;

// Nutch imports

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.metadata.Metadata;

import org.apache.nutch.parse.HTMLMetaTags;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.parse.HtmlParseFilter;

import org.apache.nutch.parse.ParseResult;

import org.apache.nutch.protocol.Content;

// Commons imports

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

// W3C imports

import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that looks for a {@code <meta name="recommended" ...>} tag
 * and, when one is present, stores its content in the content metadata of the
 * page's parse under {@link #META_RECOMMENDED_NAME}.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());

  private Configuration conf;

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME = "recommended";

  /**
   * Scans the HTML document's general meta tags for a recommended meta tag.
   *
   * @param content     the fetched content; its URL selects the parse to update
   * @param parseResult the parse result produced so far
   * @param metaTags    the meta tags extracted from the page
   * @param doc         the DOM of the page (not used here)
   * @return the same {@code parseResult}, with the recommendation recorded in
   *         the parse's content metadata when the tag was found
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    // A direct lookup replaces the scan over every property name.
    Properties generalMetaTags = metaTags.getGeneralTags();
    String recommendation = generalMetaTags.getProperty(META_RECOMMENDED_NAME);

    if (recommendation == null) {
      // Most pages carry no such tag; keep this out of the info-level log.
      if (LOG.isDebugEnabled()) {
        LOG.debug("No Recommendation");
      }
      return parseResult;
    }

    LOG.info("Found a Recommendation for " + recommendation);

    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // No parse is registered under this URL; nothing to annotate.
      LOG.warn("No parse found for " + content.getUrl()
          + ", recommendation not added");
      return parseResult;
    }

    LOG.info("Adding Recommendation for " + recommendation);
    parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);

    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

}

RecommendedQueryFilter.java

[java] view
plaincopy

package org.apache.nutch.parse.recommended;

import org.apache.nutch.searcher.FieldQueryFilter;

import java.util.logging.Logger;

// Commons imports

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

/**
 * Query filter for the "recommended" field. It is a {@link FieldQueryFilter}
 * constructed for that field with a boost of 5.
 */
public class RecommendedQueryFilter extends FieldQueryFilter {

  // Logger is named after this class so messages are attributed correctly.
  private static final Log LOG = LogFactory.getLog(RecommendedQueryFilter.class.getName());

  /** Creates the filter for the "recommended" field with a boost of 5. */
  public RecommendedQueryFilter() {
    super("recommended", 5f);
    LOG.info("Added a recommended query");
  }

}

1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行:

[xhtml] view
plaincopy

<ant dir="recommended" target="deploy" />

1.6.运行cmd,切换到recommended目录,运行ant命令编译,插件开发完成。

1.7 让nutch识别你的插件

在conf/nutch-site.xml 中增加以下属性

[c-sharp] view
plaincopy

<property>

<name>plugin.includes</name>

<value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> <description>Regular expression naming plugin id names to

include. Any plugin not matching this expression is excluded.

In any case you need at least include the nutch-extensionpoints plugin. By

default Nutch includes crawling just HTML and plain text via HTTP,

and basic indexing and search plugins.

</description>

</property>

2.编写插件测试类

2.1 在src/plugin/recommended目录下新建一个data目录,在data目录下新建一个html文件recommended.html,内容如下:

[xhtml] view
plaincopy

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html lang="en">

<head>

<meta http-equiv="Content-Type" content="text/html; charset=utf-8">

<title>recommended</title>

<meta name="generator" content="TextMate http://macromates.com/">
<meta name="author" content="Ricardo J. Méndez">

<meta name="recommended" content="recommended-content"/>

<!-- Date: 2007-02-12 -->

</head>

<body>

Recommended meta tag test.

</body>

</html>

2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录,增加TestRecommendedParser.java类,内容如下:

[xhtml] view
plaincopy

package org.apache.nutch.parse.recommended;

import org.apache.nutch.metadata.Metadata;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.parse.ParseResult;

import org.apache.nutch.parse.ParseUtil;

import org.apache.nutch.protocol.Content;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.util.NutchConfiguration;

import java.util.Properties;

import java.io.*;

import java.net.URL;

import junit.framework.TestCase;

/*

* Loads test page recommended.html and verifies that the recommended

* meta tag has recommended-content as its value.

*

*/

/**
 * Loads the test page recommended.html, parses it with the parse-html plugin
 * and verifies that the recommended meta tag ends up in the content metadata
 * with the expected value.
 */
public class TestRecommendedParser extends TestCase {

  /** Fallback location of the test data when "test.data" is not set. */
  private static final String DEFAULT_TEST_DIR =
      "H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data";

  // Prefer the "test.data" system property so the test is not tied to one
  // machine's directory layout.
  // NOTE(review): assumes the plugin test target sets "test.data" - confirm
  // against build-plugin.xml.
  private static final File testDir =
      new File(System.getProperty("test.data", DEFAULT_TEST_DIR));

  public void testPages() throws Exception {
    pageTest(new File(testDir, "recommended.html"), "http://foo.com/",
        "recommended-content");
  }

  /**
   * Parses {@code file} as HTML fetched from {@code url} and checks the
   * "recommended" content metadata value.
   *
   * @param file           the HTML file to parse
   * @param url            the URL the content is attributed to
   * @param recommendation the expected value of the "recommended" metadata
   * @throws Exception if the file cannot be read or parsing fails
   */
  public void pageTest(File file, String url, String recommendation)
      throws Exception {

    String contentType = "text/html";

    byte[] bytes = readFully(file);

    Configuration conf = NutchConfiguration.create();

    Content content =
        new Content(url, url, bytes, contentType, new Metadata(), conf);

    ParseResult parseResult =
        new ParseUtil(conf).parseByExtensionId("parse-html", content);
    Parse parse = parseResult.get(content.getUrl());
    assertNotNull("no parse produced for " + url, parse);

    Metadata metadata = parse.getData().getContentMeta();

    assertEquals(recommendation, metadata.get("recommended"));

    // Compare by value: a reference comparison against a literal always passes.
    assertFalse("somesillycontent".equals(metadata.get("recommended")));
  }

  /** Reads the whole file into memory, closing the stream even on failure. */
  private static byte[] readFully(File file) throws IOException {
    InputStream in = new FileInputStream(file);
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
      byte[] buffer = new byte[1024];
      int read;
      while ((read = in.read(buffer)) != -1) {
        out.write(buffer, 0, read);
      }
      return out.toByteArray();
    } finally {
      in.close();
    }
  }

}

2.3 用junit运行TestRecommendedParser.java测试。

转自http://blog.csdn.net/laigood/article/details/5929388
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  nutch 插件