您的位置:首页 > 其它

Solr索引pdf、txt、word等文件

2016-04-14 21:33 459 查看
这里用的solr4.7

首先搭建环境

    创建一个新core

    这里有详细的资料

    http://blog.csdn.net/clj198606061111/article/details/21288499/
修改core0里面的xml

  schema.xml

加入

<!-- Tokenized text field type "text_general" with separate index-time and query-time analyzer chains. -->
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<!-- Index-time chain: standard tokenizer, stop-word filter (stopwords.txt, case-insensitive), then lowercasing. -->
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Query-time chain: same as the index chain, plus a synonym filter (synonyms.txt, expand="true") before lowercasing. -->
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>

<!-- "text": analyzed full-text field using the text_general type. -->
<field name="text"      type="text_general" indexed="true"  stored="true"/>
<!-- "_version_": long field; presumably Solr's internal document version field (confirm against the Solr docs). -->
<field name="_version_" type="long"         indexed="true"  stored="true"/>
<!-- Any field whose name starts with "attr_" is accepted as multi-valued text_general. -->
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

<!-- general: single-valued string fields; "id" is the only required one. -->
<field name="id"        type="string"   indexed="true"  stored="true"  multiValued="false" required="true"/>
<field name="type"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="name"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="fileName"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" />


修改solrconfig.xml

     

<!-- Registers the extracting (Solr Cell) request handler at /update/extract with default request parameters. -->
<requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler" >
<lst name="defaults">
<!-- fmap.content=text: per the Solr Cell docs, maps the extracted "content" field onto the "text" field. -->
<str name="fmap.content">text</str>
<!-- lowernames=true: per the Solr Cell docs, normalizes extracted field names to lowercase. -->
<str name="lowernames">true</str>
<!-- uprefix=attr_: per the Solr Cell docs, prefixes fields not defined in the schema with "attr_". -->
<str name="uprefix">attr_</str>
<!-- captureAttr=true: per the Solr Cell docs, also indexes attributes of the extracted XHTML elements. -->
<str name="captureAttr">true</str>
</lst>
</requestHandler>


加入相应的jar包

 


  solr-4.7.2\dist    solr-cell-4.7.2

  solr-4.7.2\contrib\extraction  所有的jar包

然后引入jar包

    在solrconfig.xml文件中加入

<lib dir="../extract" regex=".*\.jar" />


重启tomcat

数据准备

   


然后代码编写 

/**
 * Indexes every regular file found directly under {@code G:/document/}.
 *
 * <p>Each file is handed to {@link #indexFilesSolrCell(String, String)}. A failure on one file
 * is reported on standard error and does not stop the remaining files from being processed.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    File parentFile = new File("G:/document/");
    if (!parentFile.exists()) {
        return;
    }

    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] files = parentFile.listFiles();
    if (files == null) {
        System.err.println("Cannot list directory: " + parentFile.getPath());
        return;
    }

    for (File file : files) {
        // Only regular files can be uploaded; skip sub-directories and other entries.
        if (!file.isFile()) {
            continue;
        }
        try {
            indexFilesSolrCell(file.getName(), file.getPath());
        } catch (IOException e) {
            System.err.println("Failed to index " + file.getPath());
            e.printStackTrace();
        } catch (SolrServerException e) {
            System.err.println("Failed to index " + file.getPath());
            e.printStackTrace();
        }
    }
}

/**
 * Uploads one file to the Solr {@code /update/extract} handler, commits it, and then prints the
 * {@code attr_filename} value of every document returned by a {@code *:*} query.
 *
 * @param fileName file name; sent as {@code literal.id} and {@code literal.fileName}, and used
 *     to choose the content type of the upload
 * @param path path of the file to upload; also sent as {@code literal.path}
 * @throws IOException propagated from reading the file or from the SolrJ request
 * @throws SolrServerException propagated from the SolrJ request or query
 */
public static void indexFilesSolrCell(String fileName, String path)
        throws IOException, SolrServerException {
    // Connect to the Solr core.
    String urlString = "http://localhost:8080/solr/core0";
    SolrServer solr = new HttpSolrServer(urlString);
    try {
        ContentStreamUpdateRequest up = new ContentStreamUpdateRequest(
                "/update/extract");

        String contentType = getFileContentType(fileName);
        up.addFile(new File(path), contentType);
        up.setParam("literal.id", fileName);
        up.setParam("literal.path", path);
        up.setParam("literal.fileName", fileName);
        // Request-level fmap.content; presumably takes precedence over a handler default.
        up.setParam("fmap.content", "attr_content");
        up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);

        solr.request(up);

        // Read back documents to show what was indexed.
        QueryResponse rsp = solr.query(new SolrQuery("*:*"));
        SolrDocumentList solrDocumentList = rsp.getResults();
        for (SolrDocument solrDocument : solrDocumentList) {
            System.out.println(solrDocument.getFieldValue("attr_filename"));
        }
    } finally {
        // A new server object is created on every call, so release it before returning.
        solr.shutdown();
    }
}

/**
 * Returns the content type for a file, chosen from the file name's extension.
 *
 * <p>The extension is the text after the last {@code '.'} and is matched case-insensitively,
 * so {@code report.PDF} and {@code report.pdf} give the same result. A name with no
 * {@code '.'}, or with an extension that is not recognised, yields {@code "othertype"}.
 *
 * @param filename file name to inspect; must not be {@code null}
 * @return the content type string for the extension, or {@code "othertype"} if none matches
 */
public static String getFileContentType(String filename) {
    int dot = filename.lastIndexOf('.');
    if (dot < 0) {
        // No extension at all: do not mistake the whole name (e.g. "pdf") for one.
        return "othertype";
    }
    // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
    String extension = filename.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);

    if ("xlsx".equals(extension)) {
        return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    } else if ("pdf".equals(extension)) {
        return "application/pdf";
    } else if ("doc".equals(extension)) {
        return "application/msword";
    } else if ("txt".equals(extension)) {
        return "text/plain";
    } else if ("xls".equals(extension)) {
        return "application/vnd.ms-excel";
    } else if ("docx".equals(extension)) {
        return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    } else if ("ppt".equals(extension)) {
        return "application/vnd.ms-powerpoint";
    } else if ("pptx".equals(extension)) {
        return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    }
    return "othertype";
}


  solr操作

    


   

    

   
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  solr