您的位置:首页 > 其它

solr6.6 solrJ索引富文本(word/pdf)文件

2017-12-13 16:04 357 查看

  1、文件配置

    在core下面新建lib文件夹,存放相关的jar包,如图所示:

    


    


    修改solrconfig.xml

   

<lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-cell-\d.*\.jar" />

<lib dir="${solr.install.dir:../../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-clustering-\d.*\.jar" />

<lib dir="${solr.install.dir:../../../..}/contrib/langid/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-langid-\d.*\.jar" />

<lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />
<lib dir="./lib" regex=".*\.jar"/>


    在solrconfig.xml中增加下面的/update/extract请求处理器(ExtractingRequestHandler)配置,如果文件中已经存在则不用重复添加:

    

<requestHandler name="/update/extract"
startup="lazy"
class="solr.extraction.ExtractingRequestHandler" >
<lst name="defaults">
<str name="fmap.content">text</str>
<str name="fmap.meta">ignored_</str>
<str name="lowernames">true</str>
<str name="uprefix">attr_</str>
<str name="captureAttr">true</str>
</lst>
</requestHandler>


  配置managed-schema文件:

   


  

  修改managed-schema文件,增加以下字段(其中path、pathftype、pathuploaddate由后面Java代码中的literal.*参数写入,attr_content通过fmap.content映射保存文件正文):

<field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="pathftype"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="pathuploaddate"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="pathsummary"      type="string"   indexed="true"  stored="true"  multiValued="false" />
<field name="attr_content"      type="text_general"   indexed="true"  stored="true"  multiValued="false" />


  2、Java代码solrj操作(6.6.0版本)

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;

/**
 * Indexes rich-text files (PDF, Word, Excel, ...) into Solr by posting them to the
 * {@code /update/extract} request handler through SolrJ.
 *
 * <p>Created 15:16 2017/12/13.
 *
 * @author sks
 */
public class solr_pdf {

    /** Base URL of the Solr core that receives the documents. */
    private static final String SOLR_URL = "http://localhost:8983/solr/test";

    /** Request handler path that the file is posted to. */
    private static final String EXTRACT_HANDLER = "/update/extract";

    /**
     * Value returned by {@link #getFileContentType(String)} for unrecognised extensions.
     * NOTE(review): this is not a registered MIME type; it is kept only so existing
     * callers keep seeing the same value.
     */
    private static final String UNKNOWN_CONTENT_TYPE = "othertype";

    /**
     * Demo entry point: indexes one hard-coded PDF and prints any failure to stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "D:/work/Solr/ImportData/20160229001cn.pdf";
        String solrId = "20160229001cn.pdf";

        try {
            indexFilesSolrCell(solrId, solrId, fileName);
        } catch (IOException | SolrServerException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns today's date formatted as {@code yyyy-MM-dd}.
     *
     * @return the current date, e.g. {@code 2017-12-13}
     */
    private static String GetCurrentDate() {
        // A new formatter per call: SimpleDateFormat is not thread-safe, so it is not shared.
        return new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    /**
     * Sends one file to the extract handler and commits it.
     *
     * <p>The request carries these literal fields: {@code id}, {@code path},
     * {@code pathuploaddate} and, when the file name has an extension, {@code pathftype}.
     * The extracted body is mapped to {@code attr_content} via {@code fmap.content}.
     *
     * @param fileName file name used to derive the content type and the file-type field
     * @param solrId   value for the {@code id} field; when {@code null} or empty the
     *                 {@code fileName} is used instead (the previous behaviour)
     * @param path     location of the file on disk; also stored in the {@code path} field
     * @throws IOException         if the file cannot be read, the request fails at the
     *                             I/O level, or the client cannot be closed
     * @throws SolrServerException if Solr reports an error while handling the request
     */
    public static void indexFilesSolrCell(String fileName, String solrId, String path)
            throws IOException, SolrServerException {
        String documentId = (solrId == null || solrId.isEmpty()) ? fileName : solrId;
        String fileType = extensionOf(fileName);

        ContentStreamUpdateRequest up = new ContentStreamUpdateRequest(EXTRACT_HANDLER);
        up.addFile(new File(path), getFileContentType(fileName));
        up.setParam("literal.id", documentId);
        up.setParam("literal.path", path);                        // file path
        up.setParam("literal.pathuploaddate", GetCurrentDate());  // upload date
        if (!fileType.isEmpty()) {
            up.setParam("literal.pathftype", fileType);           // file type, e.g. doc, pdf
        }
        up.setParam("fmap.content", "attr_content");              // extracted file content
        up.setAction(ACTION.COMMIT, true, true);

        // try-with-resources releases the underlying HTTP client even when the request fails.
        try (SolrClient solr = new HttpSolrClient.Builder(SOLR_URL).build()) {
            solr.request(up);
        }
    }

    /**
     * Maps a file name to a content type based on its extension (case-insensitive).
     *
     * @param filename file name or path; may be {@code null}
     * @return the matching MIME type, or {@code "othertype"} when the name is
     *         {@code null}, has no extension, or the extension is not recognised
     */
    public static String getFileContentType(String filename) {
        switch (extensionOf(filename).toLowerCase(java.util.Locale.ROOT)) {
            case "xlsx":
                return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
            case "pdf":
                return "application/pdf";
            case "doc":
                return "application/msword";
            case "txt":
                return "text/plain";
            case "xls":
                return "application/vnd.ms-excel";
            case "docx":
                return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            case "ppt":
                return "application/vnd.ms-powerpoint";
            case "pptx":
                return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
            default:
                return UNKNOWN_CONTENT_TYPE;
        }
    }

    /**
     * Extracts the text after the last dot of the final path segment, preserving case.
     *
     * @param filename file name or path; may be {@code null}
     * @return the extension without the dot, or an empty string when there is none
     */
    private static String extensionOf(String filename) {
        if (filename == null) {
            return "";
        }
        int dot = filename.lastIndexOf('.');
        int separator = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
        // A dot that belongs to a directory name (before the last separator) is not an extension.
        if (dot < 0 || dot < separator || dot == filename.length() - 1) {
            return "";
        }
        return filename.substring(dot + 1);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: