Lucene7.0与HanLP分词器整合索引数据库建立索引文件
2017-11-29 14:05
369 查看
HanLP官网:http://hanlp.linrunsoft.com/
GitHub地址:https://github.com/hankcs/HanLP
HanLP插件地址:https://github.com/hankcs/hanlp-lucene-plugin
需要以下jar包
类
package com.kyd.demo.hanLP;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;
/**
* 索引数据库字段建立索引文件
*
* @author zhengzhen
*
*/
/**
 * Builds a Lucene file-system index from database rows, using the HanLP
 * analyzers for Chinese word segmentation.
 *
 * @author zhengzhen
 */
public class JdbcIndexDemo {
    public static void main(String[] args) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://192.168.100.69:3306/xxxx?useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false";
            String password = "root";
            String userName = "root";
            String sql = "select * from xxxx";
            try (
                Connection conn = DriverManager.getConnection(url, userName, password);
                PreparedStatement sta = conn.prepareStatement(sql);
                ResultSet rs = sta.executeQuery();
                // 1. Index files are written under this local directory.
                Directory directory = FSDirectory.open(Paths.get("xxxx_index"));
                // 2. Index-time analyzer: splits long words into sub-terms as well.
                Analyzer analyzer = new HanLPIndexAnalyzer()
            ) {
                // 3. Writer configuration; CREATE overwrites any existing index.
                IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
                indexWriterConfig.setOpenMode(OpenMode.CREATE);
                // 4. IndexWriter must be closed to release the index write lock —
                //    the original code leaked it; try-with-resources fixes that.
                try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
                    // 5. One Lucene document per result-set row.
                    while (rs.next()) {
                        Document document = new Document();
                        // StringField: indexed as a single un-analyzed token, stored.
                        long id = rs.getLong("unitId");
                        document.add(new StringField("unitId", id + "", Store.YES));
                        // TextField: analyzed (tokenized) and stored.
                        String title = rs.getString("title");
                        if (title != null) {
                            document.add(new TextField("sectionName", title, Store.YES));
                        }
                        String unitName = rs.getString("unitName");
                        if (unitName != null) {
                            document.add(new TextField("unitName", unitName, Store.YES));
                        }
                        String courseName = rs.getString("courseName");
                        if (courseName != null) {
                            document.add(new TextField("courseName", courseName, Store.YES));
                        }
                        String startPage = rs.getString("startPage");
                        if (startPage != null) {
                            document.add(new StringField("startPage", startPage, Store.YES));
                        }
                        // NOTE(review): reads column "startEndPage" but indexes it as
                        // field "endPage" — confirm the column name against the schema.
                        String endPage = rs.getString("startEndPage");
                        if (endPage != null) {
                            document.add(new StringField("endPage", endPage, Store.YES));
                        }
                        indexWriter.addDocument(document);
                    }
                    indexWriter.commit();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
        }
    }

    /**
     * HanLPAnalyzer keeps long words whole: "中华人民共和国" stays one token.
     *
     * @throws IOException if tokenization fails
     */
    @Test
    public void hanLPAnalyzerTest() throws IOException {
        printTokens(new HanLPAnalyzer(), "中华人民共和国很辽阔");
        /* 输出:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }

    /**
     * HanLPIndexAnalyzer additionally emits overlapping sub-terms of long words:
     * "中华人民共和国" also yields "中华", "人民", and so on.
     *
     * @throws IOException if tokenization fails
     */
    @Test
    public void hanLPIndexAnalyzerTest() throws IOException {
        printTokens(new HanLPIndexAnalyzer(), "中华人民共和国很辽阔");
        /* 输出:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 中华人民 0 4 1
         * 中华 0 2 1
         * 华人 1 3 1
         * 人民共和国 2 7 1
         * 人民 2 4 1
         * 共和国 4 7 1
         * 共和 4 6 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }

    /**
     * Prints each character of {@code text} with its offset, then every token
     * produced by {@code analyzer} with its start/end offsets and position
     * increment. Closes the token stream properly (reset/end/close contract).
     */
    private static void printTokens(Analyzer analyzer, String text) throws IOException {
        for (int i = 0; i < text.length(); ++i) {
            System.out.print(text.charAt(i) + "" + i + " ");
        }
        System.out.println();
        try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
            // Attribute instances are per-stream; look them up once, not per token.
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // 偏移量 (character offsets)
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // 距离 (position increment)
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
            }
            tokenStream.end();
        }
    }
}
GitHub地址:https://github.com/hankcs/HanLP
HanLP插件地址:https://github.com/hankcs/hanlp-lucene-plugin
需要以下jar包
类
package com.kyd.demo.hanLP;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;
/**
* 索引数据库字段建立索引文件
*
* @author zhengzhen
*
*/
/**
 * Builds a Lucene file-system index from database rows, using the HanLP
 * analyzers for Chinese word segmentation.
 *
 * @author zhengzhen
 */
public class JdbcIndexDemo {
    public static void main(String[] args) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://192.168.100.69:3306/xxxx?useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false";
            String password = "root";
            String userName = "root";
            String sql = "select * from xxxx";
            try (
                Connection conn = DriverManager.getConnection(url, userName, password);
                PreparedStatement sta = conn.prepareStatement(sql);
                ResultSet rs = sta.executeQuery();
                // 1. Index files are written under this local directory.
                Directory directory = FSDirectory.open(Paths.get("xxxx_index"));
                // 2. Index-time analyzer: splits long words into sub-terms as well.
                Analyzer analyzer = new HanLPIndexAnalyzer()
            ) {
                // 3. Writer configuration; CREATE overwrites any existing index.
                IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
                indexWriterConfig.setOpenMode(OpenMode.CREATE);
                // 4. IndexWriter must be closed to release the index write lock —
                //    the original code leaked it; try-with-resources fixes that.
                try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
                    // 5. One Lucene document per result-set row.
                    while (rs.next()) {
                        Document document = new Document();
                        // StringField: indexed as a single un-analyzed token, stored.
                        long id = rs.getLong("unitId");
                        document.add(new StringField("unitId", id + "", Store.YES));
                        // TextField: analyzed (tokenized) and stored.
                        String title = rs.getString("title");
                        if (title != null) {
                            document.add(new TextField("sectionName", title, Store.YES));
                        }
                        String unitName = rs.getString("unitName");
                        if (unitName != null) {
                            document.add(new TextField("unitName", unitName, Store.YES));
                        }
                        String courseName = rs.getString("courseName");
                        if (courseName != null) {
                            document.add(new TextField("courseName", courseName, Store.YES));
                        }
                        String startPage = rs.getString("startPage");
                        if (startPage != null) {
                            document.add(new StringField("startPage", startPage, Store.YES));
                        }
                        // NOTE(review): reads column "startEndPage" but indexes it as
                        // field "endPage" — confirm the column name against the schema.
                        String endPage = rs.getString("startEndPage");
                        if (endPage != null) {
                            document.add(new StringField("endPage", endPage, Store.YES));
                        }
                        indexWriter.addDocument(document);
                    }
                    indexWriter.commit();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
        }
    }

    /**
     * HanLPAnalyzer keeps long words whole: "中华人民共和国" stays one token.
     *
     * @throws IOException if tokenization fails
     */
    @Test
    public void hanLPAnalyzerTest() throws IOException {
        printTokens(new HanLPAnalyzer(), "中华人民共和国很辽阔");
        /* 输出:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }

    /**
     * HanLPIndexAnalyzer additionally emits overlapping sub-terms of long words:
     * "中华人民共和国" also yields "中华", "人民", and so on.
     *
     * @throws IOException if tokenization fails
     */
    @Test
    public void hanLPIndexAnalyzerTest() throws IOException {
        printTokens(new HanLPIndexAnalyzer(), "中华人民共和国很辽阔");
        /* 输出:
         * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9
         * 中华人民共和国 0 7 1
         * 中华人民 0 4 1
         * 中华 0 2 1
         * 华人 1 3 1
         * 人民共和国 2 7 1
         * 人民 2 4 1
         * 共和国 4 7 1
         * 共和 4 6 1
         * 很 7 8 1
         * 辽阔 8 10 1
         */
    }

    /**
     * Prints each character of {@code text} with its offset, then every token
     * produced by {@code analyzer} with its start/end offsets and position
     * increment. Closes the token stream properly (reset/end/close contract).
     */
    private static void printTokens(Analyzer analyzer, String text) throws IOException {
        for (int i = 0; i < text.length(); ++i) {
            System.out.print(text.charAt(i) + "" + i + " ");
        }
        System.out.println();
        try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
            // Attribute instances are per-stream; look them up once, not per token.
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // 偏移量 (character offsets)
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // 距离 (position increment)
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
            }
            tokenStream.end();
        }
    }
}
相关文章推荐
- lucene对文件建立索引之二
- lucene建立pdf文件内容全文索引
- 利用Lucene将被索引文件目录中的所有文件建立索引
- Lucene.NET建立,搜索多个索引文件
- lucene: 索引建立完后无法查看索引文件中的数据
- lucene建立索引的过程
- lucene入门-复杂索引建立
- lucene索引文件大于2G的处理情况
- html抽取文本信息-java版(适合lucene建立索引)
- Solr4.8.0源码分析(10)之Lucene的索引文件(3)
- 使用Lucene.Net 3.0.3进行文件索引和检索
- Lucene学习总结之三:Lucene的索引文件格式(2)
- hive中对lzo压缩文件建立索引实现并行处理
- HBase整合MapReduce之建立HBase索引
- 应用Lucene.net建立全文索引引擎
- lucene 索引文件大小分布_tim
- lucene003_lucene的索引文件格式
- lucene4.5源码分析系列:lucene默认索引的文件格式-总述
- Lucene学习(三):综述Lucene的索引文件格式
- ctags建立include头文件的vim形式索引文件