您的位置:首页 > 运维架构 > 网站架构

用nutch和solrnet建立自己的搜索网站

2014-06-04 10:57 423 查看
下载解压nutch和solr

添加javahome

export JAVA_HOME=/opt/bitnami/java

修改vi conf/regex-urlfilter.txt

+^http://([a-z0-9]*\.)*letv.com/

新建urls目录,用于存放首要抓取的url列表,我们存放http://www.letv.com/
[zhouhh@Hadoop48 nutch]$ mkdir urls
[zhouhh@Hadoop48 urls]$ vi seed.txt
 http://www.letv.com/
修改
 nutch-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <property>
    <name>http.agent.name</name>
    <value>myag</value>
    <description>HTTP 'User-Agent' request header. MUST NOT be empty -
    please set this to a single word uniquely related to your organization.

    NOTE: You should also check other related properties:

    http.robots.agents
    http.agent.description
    http.agent.url
    http.agent.email
    http.agent.version

    and set their values appropriately.
    </description>
  </property>

  <!-- parser.skip.truncated is a boolean switch, so it must be true or false
       (the agent name is not a valid value). false makes the parser also
       handle documents that were truncated while being fetched. -->
  <property>
    <name>parser.skip.truncated</name>
    <value>false</value>
  </property>

  <property>
    <name>http.robots.agents</name>
    <value>myag,*</value>
    <description>The agent strings we'll look for in robots.txt files,
    comma-separated, in decreasing order of precedence. You should
    put the value of http.agent.name as the first agent name, and keep the
    default * at the end of the list. E.g.: BlurflDev,Blurfl,*
    </description>
  </property>

</configuration>

复制nutch/conf里面的schema-solr4.xml到solr并改名为schema.xml
并在fields里面添加一行

<field name="_version_" type="long" indexed="true" stored="true"/>

到solr目录启动solr

java -jar start.jar

cd /opt/bitnami/nutch/

后台不挂起运行nutch

不带错误log

# The space before '>' matters: "50>/dev/null" would redirect file descriptor 50
# instead of passing 50 (the number of rounds) to bin/crawl.
nohup ./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50 >/dev/null 2>&1 &

带错误log
# The space before '>' matters: "50>/dev/null" would redirect file descriptor 50
# instead of passing 50 (the number of rounds) to bin/crawl. stderr goes to ./log.
nohup ./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50 >/dev/null 2>log &

前台运行

./bin/crawl /opt/bitnami/nutch/urls /opt/bitnami/nutch/sp http://localhost:8983/solr/ 50
查看记录数
# Read the crawldb of the crawl directory used by the crawl commands (/opt/bitnami/nutch/sp).
./bin/nutch readdb /opt/bitnami/nutch/sp/crawldb -stats

到solrweb管理界面

http://localhost:8983/solr/
然后用solrnet调用这个solr做测试网站
/// <summary>
/// Search page code-behind: sends the text in <c>q</c> to Solr through SolrNet and
/// binds one page of results to the grid, using <c>pidx</c> as the zero-based page index.
/// </summary>
public partial class gosearch : System.Web.UI.Page
{
    // Number of rows requested from Solr for every page of results.
    private const int PageSize = 20;

    // Guards the one-time SolrNet registration below.
    private static readonly object SolrInitLock = new object();
    private static bool solrInitialized;

    /// <summary>Zero-based page index, kept as a string.</summary>
    public string pidx = "0";

    /// <summary>Query text sent to Solr.</summary>
    public string q = "tgbus";

    /// <summary>
    /// On the first (non-postback) request, reads "q" and "pidx" from the query string
    /// and binds the results. Without "q" the default query is shown on the first page.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            return;
        }

        string requestedQuery = Request.QueryString["q"];
        if (requestedQuery != null)
        {
            q = requestedQuery;
            C1InputText1.Text = q;

            // A missing or malformed "pidx" falls back to the first page instead of
            // leaving the page unbound or throwing a FormatException.
            int pageIndex = ParsePageIndex(Request.QueryString["pidx"]);
            pidx = pageIndex.ToString();
            Label1.Text = "当前为" + (pageIndex + 1).ToString() + "页";
        }
        else
        {
            pidx = "0";
        }

        bind();
    }

    /// <summary>
    /// Queries Solr for the current <c>q</c>/<c>pidx</c> and updates the grid, the pager
    /// and the result-count label.
    /// </summary>
    private void bind()
    {
        EnsureSolrInitialized();

        int pageIndex = ParsePageIndex(pidx);
        var solr = ServiceLocator.Current.GetInstance<ISolrOperations<prt>>();

        QueryOptions qo = new QueryOptions();
        qo.Rows = PageSize;
        // Start is a row offset, not a page number.
        qo.Start = pageIndex * PageSize;

        ISolrQueryResults<prt> results = solr.Query(q, qo);
        C1GridView1.DataSource = results.ToList();
        C1GridView1.DataBind();

        // Round up so that a final, partially filled page is reachable; never below 1.
        int pageCount = (results.NumFound + PageSize - 1) / PageSize;
        C1Pager1.PageCount = Math.Max(1, pageCount);
        C1Pager1.PageIndex = pageIndex;
        Label2.Text = "共找到" + results.NumFound.ToString() + "项";
    }

    /// <summary>Pager callback: shows the newly selected page for the text in the input box.</summary>
    protected void C1Pager1_PageIndexChanged(object sender, EventArgs e)
    {
        // Fall back to the page's own pager if the event came from an unexpected sender.
        C1Pager pager = sender as C1Pager ?? C1Pager1;
        Label1.Text = "当前为" + (pager.PageIndex + 1).ToString() + "页";
        pidx = pager.PageIndex.ToString();
        q = C1InputText1.Text;
        bind();
    }

    /// <summary>Search button: redirects to this page with the entered text as "q".</summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string text = this.C1InputText1.Text;
        if (!string.IsNullOrEmpty(text))
        {
            // Encode the text so spaces, '&', '#' and non-ASCII characters survive the URL.
            Response.Redirect("gosearch.aspx?q=" + Server.UrlEncode(text) + "&pidx=0");
        }
    }

    /// <summary>
    /// Returns true when <paramref name="CString"/> contains at least one character
    /// in the range U+4E00..U+9FA5.
    /// </summary>
    public static bool IsChinese(string CString)
    {
        return System.Text.RegularExpressions.Regex.IsMatch(CString, @"[\u4e00-\u9fa5]");
    }

    /// <summary>
    /// Registers the Solr connection for <c>prt</c> once per process; later calls do nothing.
    /// </summary>
    private static void EnsureSolrInitialized()
    {
        // NOTE(review): SolrNet's Startup.Init is documented as a once-per-application call;
        // repeating it for the same document type is expected to fail, hence the guard.
        lock (SolrInitLock)
        {
            if (!solrInitialized)
            {
                Startup.Init<prt>("http://192.168.1.157:8983/solr");
                solrInitialized = true;
            }
        }
    }

    /// <summary>
    /// Converts a page-index string to a non-negative int; returns 0 for null,
    /// non-numeric or negative input, and caps the value so that the row offset
    /// (index * PageSize) cannot overflow.
    /// </summary>
    private static int ParsePageIndex(string value)
    {
        int parsed;
        if (!int.TryParse(value, out parsed) || parsed < 0)
        {
            return 0;
        }

        return Math.Min(parsed, int.MaxValue / PageSize);
    }
}
}
测试地址
http://asearch.azurewebsites.net
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: