您的位置:首页 > 其它

Lucene工作总结

2009-09-15 18:27 141 查看
关键字: lucene总结
公司项目:portal 中期刊文章的内容作为大字段(CLOB)存储在 Oracle 中。首页有一个搜索功能,要求把所有内容中包含搜索关键字的文章的标题列出来。也就是说,要用 Lucene 对数据库的大字段建立索引(索引由计划任务定时重建)并进行搜索。

==================定时建立索引文件:===============

Main方法:

Java代码

package zxt.lucene.index;

import java.util.Timer;

/**
 * Launcher that schedules the periodic index build.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class IndexerServer {

    /**
     * Registers the configuration file name, then schedules the shared
     * {@link LuceneDBIndexerTask} at a fixed rate: first run immediately, then every
     * {@code Constant.CREATE_INDEX_SLEEP_TIME} milliseconds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Must happen before Constant/Config are first touched below.
        Config.setConfigFileName("directory.properties");

        Timer scheduler = new Timer();
        LuceneDBIndexerTask indexTask = LuceneDBIndexerTask.getInstance();
        long periodMillis = DataTypeUtil.toLong(Constant.CREATE_INDEX_SLEEP_TIME);

        scheduler.scheduleAtFixedRate(indexTask, 0, periodMillis);
    }
}

定时调用建立索引任务:

Java代码

package zxt.lucene.index;

import java.util.Timer;

/**
 * Launcher that schedules the periodic index build.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class IndexerServer {

    /**
     * Registers the configuration file name, then schedules the shared
     * {@link LuceneDBIndexerTask} at a fixed rate: first run immediately, then every
     * {@code Constant.CREATE_INDEX_SLEEP_TIME} milliseconds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Must happen before Constant/Config are first touched below.
        Config.setConfigFileName("directory.properties");

        Timer scheduler = new Timer();
        LuceneDBIndexerTask indexTask = LuceneDBIndexerTask.getInstance();
        long periodMillis = DataTypeUtil.toLong(Constant.CREATE_INDEX_SLEEP_TIME);

        scheduler.scheduleAtFixedRate(indexTask, 0, periodMillis);
    }
}

建立索引的核心实现:

Java代码

package zxt.lucene.index;

import java.io.BufferedReader;

import java.io.File;

import java.io.IOException;

import java.io.StringWriter;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

import java.text.SimpleDateFormat;

import java.util.Arrays;

import java.util.Date;

import java.util.TimerTask;

import oracle.sql.CLOB;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

/**

* 建立索引的任务类

* @author wulihai

* @create 2009-06-02

*/

public class LuceneDBIndexerTask extends TimerTask {

// Fallback index root, used by the constructor when Constant.INDEX_STORE_DIRECTORY is null or empty.

private static String DEFAULT_INDEX_DIR="C://IndexDB";

// Parent directory under which run() creates one timestamp-named index directory per build.

private File parentDir=null;

// The single shared instance handed out by getInstance().

private static LuceneDBIndexerTask index=new LuceneDBIndexerTask();

/**
 * Resolves the parent index directory and makes sure it exists.
 *
 * <p>Uses {@code Constant.INDEX_STORE_DIRECTORY} when it is non-empty, otherwise
 * {@code DEFAULT_INDEX_DIR}.
 */
private LuceneDBIndexerTask() {
    String dirStr = Constant.INDEX_STORE_DIRECTORY;
    if (dirStr != null && !"".equals(dirStr)) {
        this.parentDir = new File(dirStr);
    } else {
        this.parentDir = new File(DEFAULT_INDEX_DIR);
    }

    // mkdirs(), not mkdir(): the configured path may be nested (e.g. D:/lucene/indexDB/),
    // and mkdir() fails when an intermediate directory is missing.
    if (!this.parentDir.exists() && !this.parentDir.mkdirs()) {
        System.err.println("Unable to create index directory " + this.parentDir);
    }
}

/**

* Single-instance access point.

*

* @return the shared task instance held in the static {@code index} field

*/

public static LuceneDBIndexerTask getInstance(){

return index;

}

/**

* 锁定目录以及文件

* 只允许单线程访问

*

*/

/*public synchronized void singleRunning(){

if(flag==false){

flag=true;

run(parentDir);

}

}*/

/**
 * Builds a fresh Lucene index from the rows returned by {@code Constant.DB_QUERY_STRING}.
 *
 * <p>Each run writes into a new directory under {@code parentDir} named after the current
 * time ({@code yyyyMMddHHmmss}). After a successful build, {@link #checkFiles(File)} prunes
 * old index directories. I/O, SQL and driver-loading failures are printed and end the run;
 * the index writer and all JDBC resources are released in every case.
 *
 * <p>NULL column values are indexed as empty strings so that one incomplete row cannot
 * throw out of this method (an uncaught exception would terminate the scheduling Timer).
 */
public void run() {
    System.out.println("====LuceneDBIndexerTask$run()===============");
    System.out.println("~~~开始建立索引文件~~~~~~~~~~~~~~~");

    Connection conn = null;
    Statement stmt = null;
    ResultSet rs = null;
    IndexWriter writer = null;
    try {
        Class.forName(Constant.DB_DRIVER_STRING);
        conn = DriverManager.getConnection(Constant.DB_URI_STRING, Constant.DB_USERNAME, Constant.DB_PWD);
        stmt = conn.createStatement();
        rs = stmt.executeQuery(Constant.DB_QUERY_STRING);

        File file = new File(parentDir, new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()));
        if (!file.exists() && !file.mkdirs()) {
            throw new IOException("Unable to create index directory " + file.getPath());
        }

        writer = new IndexWriter(file, new StandardAnalyzer(), true);
        long startTime = new Date().getTime();

        // Local to this run, so sharing it across rows is safe.
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        while (rs.next()) {
            Document doc = new Document();
            doc.add(textField("ARTICLEID", rs.getString("ARTICLEID")));
            doc.add(textField("TITLE", rs.getString("TITLE")));
            doc.add(textField("USERNAME", rs.getString("USERNAME")));
            doc.add(textField("USERID", rs.getString("USERID")));

            // Index the creation date as yyyy-MM-dd.
            Date created = rs.getTimestamp("CREATEDATE");
            doc.add(textField("CREATEDATE", created == null ? "" : dayFormat.format(created)));

            // Index the large CLOB column.
            doc.add(textField("CONTENT", readClob(rs.getClob("CONTENT"))));

            writer.addDocument(doc);
        }
        writer.optimize();
        writer.close();
        writer = null; // closed successfully; nothing left for the finally block to do

        // Report how long indexing took.
        long endTime = new Date().getTime();
        System.out.println("索引文件"+file.getPath()+"建立成功...");
        System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!");

        // Keep only the most recent index directories.
        checkFiles(parentDir);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SQLException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } finally {
        // Only reached with a live writer when the build failed part-way.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Close each JDBC resource independently so one failure cannot leak the others.
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}

/** Creates a stored, tokenized field, substituting an empty string for a null value. */
private static Field textField(String name, String value) {
    return new Field(name, value == null ? "" : value, Field.Store.YES, Field.Index.TOKENIZED);
}

/** Reads a CLOB fully into a string; returns an empty string for a null CLOB. */
private static String readClob(java.sql.Clob clob) throws SQLException, IOException {
    if (clob == null) {
        return "";
    }
    BufferedReader in = new BufferedReader(clob.getCharacterStream());
    try {
        StringWriter out = new StringWriter();
        char[] buffer = new char[4096];
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
        return out.toString();
    } finally {
        in.close();
    }
}

/**
 * Prunes {@code dir} so that at most three entries remain, deleting the entries whose
 * names sort first.
 *
 * <p>The index directories created by {@code run()} are named with a
 * {@code yyyyMMddHHmmss} timestamp, so ascending name order is oldest-first.
 *
 * @param dir parent directory holding the index directories
 */
public void checkFiles(File dir) {
    String[] names = dir.list();
    if (names == null) {
        // Not a directory, or it could not be read: nothing to prune.
        return;
    }
    Arrays.sort(names);

    int surplus = names.length - 3;
    for (int i = 0; i < surplus; i++) {
        // Delete by sorted name; the order of listFiles() is unspecified.
        deleteDirectory(new File(dir, names[i]));
    }
}

/**
 * Recursively deletes {@code path} and everything beneath it.
 *
 * @param path file or directory to delete; {@code null} is treated as nothing to delete
 * @return {@code true} if {@code path} itself was deleted
 */
public boolean deleteDirectory(File path) {
    if (path == null) {
        return false;
    }
    if (path.exists()) {
        // listFiles() returns null for a regular file or an unreadable directory.
        File[] files = path.listFiles();
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                if (files[i].isDirectory()) {
                    deleteDirectory(files[i]);
                } else {
                    // Plain file: delete directly.
                    files[i].delete();
                }
            }
        }
    }

    // Finally remove the (now empty) directory itself.
    boolean hasdelete = path.delete();
    if (hasdelete) {
        System.out.println("删除索引目录"+path);
    }
    return hasdelete;
}

/**
 * Manual entry point: runs one index build synchronously on a newly constructed task
 * (not the instance returned by {@code getInstance()}).
 *
 * @param args unused
 */
public static void main(String[] args) {
    LuceneDBIndexerTask oneShot = new LuceneDBIndexerTask();
    oneShot.run();
}

}

配置文件管理类:

Java代码

package zxt.lucene.index;

import java.io.IOException;

import java.io.InputStream;

import java.util.Properties;

/**
 * Lazily loaded, process-wide view of the indexer's properties file.
 *
 * <p>The file is looked up on the classpath. Call {@link #setConfigFileName(String)} before
 * the first {@link #getInstance()} to choose the resource; otherwise
 * {@code directory.properties} is used.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class Config {

    /** Resource loaded when no file name has been set. */
    private static final String DEFAULT_CONFIG_FILE = "directory.properties";

    private static Config cfg = null;

    private static String configFileName = null;

    private Properties props;

    public Config() {
        props = new java.util.Properties();
    }

    /**
     * Single-instance access point; loads the properties on first use.
     *
     * @return the shared configuration (empty if the resource could not be loaded)
     */
    public synchronized static Config getInstance() {
        if (cfg == null) {
            cfg = new Config();
            cfg.loadConfig();
        }
        return cfg;
    }

    /**
     * Loads the configured classpath resource into {@code props}.
     *
     * @return 1 if the properties were loaded, 0 otherwise
     */
    private int loadConfig() {
        // Fall back to the default rather than dereferencing a null name.
        String resource = configFileName;
        if (resource == null || resource.length() == 0) {
            resource = DEFAULT_CONFIG_FILE;
        }
        System.out.println("configFileName=" + resource);

        InputStream inputStream = Config.class.getClassLoader().getResourceAsStream(resource);
        if (inputStream == null) {
            // getResourceAsStream returns null when the resource is not on the classpath.
            System.err.println("Configuration resource not found on classpath: " + resource);
            return 0;
        }
        try {
            props.load(inputStream);
            return 1;
        } catch (IOException e) {
            e.printStackTrace();
            return 0;
        } finally {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Sets the classpath resource to load. Only effective before the first
     * {@link #getInstance()} call.
     *
     * @param cfg resource name, e.g. {@code directory.properties}
     */
    public static void setConfigFileName(String cfg) {
        configFileName = cfg;
    }

    /**
     * Looks up a configuration value.
     *
     * @param keyName property key
     * @return the value, or {@code null} if the key is absent
     */
    public String getProperty(String keyName) {
        return props.getProperty(keyName);
    }
}

常量配置

Java代码

package zxt.lucene.index;

/**

* 常量配置类 *

* @author wulihai

* @create 2009-06-02

*/

public class Constant {

// 隔多长时间建立一次索引

public static final String CREATE_INDEX_SLEEP_TIME = Config.getInstance()

.getProperty("create_index_sleep_time");

// 索引文件存放路径

public static final String INDEX_STORE_DIRECTORY = Config.getInstance()

.getProperty("index_store_directory");

//数据库驱动程序

public static final String DB_DRIVER_STRING = Config.getInstance()

.getProperty("db_driver_string");

//数据库连接URI

public static final String DB_URI_STRING = Config.getInstance()

.getProperty("db_uri_string");

//数据库连接username

public static final String DB_USERNAME= Config.getInstance()

.getProperty("db_username");

//数据库连接pwd

public static final String DB_PWD= Config.getInstance()

.getProperty("db_pwd");

//数据库查询语句db_query_str

public static final String DB_QUERY_STRING= Config.getInstance()

.getProperty("db_query_string");

}

数据类型处理类:

Java代码

package zxt.lucene.index;

/**
 * Data type conversion helpers.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class DataTypeUtil {

    /**
     * Converts an object to a {@code long} by parsing its {@code toString()} value.
     *
     * <p>Leading and trailing whitespace is ignored, so a properties value such as
     * {@code "120000 "} parses as 120000.
     *
     * @param o source object; must not be {@code null}
     * @return the parsed value, or {@code Long.MAX_VALUE} if the text is not a valid long
     * @throws IllegalArgumentException if {@code o} is {@code null}
     */
    public static long toLong(Object o) {
        if (o == null) {
            throw new IllegalArgumentException("该对象为空");
        }
        String s = o.toString();
        if (s == null) {
            return Long.MAX_VALUE;
        }
        try {
            return Long.parseLong(s.trim());
        } catch (NumberFormatException ex) {
            return Long.MAX_VALUE;
        }
    }
}

配置文件 :

Properties代码

#== the directory for store lucene-index ========#

index_store_directory=D:/lucene/indexDB/

#======== two hours ========#

#create_index_sleep_time=7200000

#======== two minutes ========#

create_index_sleep_time=120000

db_driver_string=oracle.jdbc.driver.OracleDriver

db_uri_string=jdbc:oracle:thin:@localhost:1521:lportal

db_username=lportal

db_pwd=lportal

db_query_string=SELECT * from journalarticle

==================搜索类:===============

核心搜索类:

Java代码

package com.liferay.portal.util;

import java.io.File;

import java.io.IOException;

import java.io.InputStream;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Date;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import com.liferay.portlet.journal.model.JournalArticle;

/**

* 负责搜索的类

*/

public class LuceneDBQuery {

// The single shared instance handed out by getInstance().
private static LuceneDBQuery search = new LuceneDBQuery();

// Private constructor: instances are obtained through getInstance() only.

private LuceneDBQuery() {

}

/**

* Single-instance access point.

*

* @return the shared instance held in the static {@code search} field

*/

public static LuceneDBQuery getInstance() {

return search;

}

/**
 * Searches the {@code CONTENT} field of the current index and returns the matching articles.
 *
 * <p>The index directory is chosen by {@link #getTargetDir(File)} beneath the path returned
 * by {@code LuceneDBQueryUtil.getIndexPath()}. Failures (no index, unparsable query, I/O
 * errors) are printed and yield an empty or partial list; this method never returns
 * {@code null}. The searcher and directory are closed before returning.
 *
 * @param queryString Lucene query text; {@code null} or blank yields an empty list
 * @return list of {@code JournalArticle} with title, ids, user name and dates populated
 */
public List search(String queryString) {
    long startTime = new Date().getTime();
    List results = new ArrayList();

    if (queryString == null || queryString.trim().length() == 0) {
        return results;
    }

    String indexDir = LuceneDBQueryUtil.getIndexPath();
    if (indexDir == null || "".equals(indexDir)) {
        System.out.println("Index directory is not configured; nothing to search.");
        return results;
    }

    File searchDir = new File(indexDir);
    if (!searchDir.exists()) {
        searchDir.mkdir();
    }
    if (!searchDir.isDirectory()) {
        return results;
    }

    // searchDir is the PARENT of the individual index directories.
    File targetDir = getTargetDir(searchDir);
    if (targetDir == null) {
        System.out.println("0个结果!");
        return results;
    }

    // Build the Query against the CONTENT field before opening any index files.
    Analyzer analyzer = new StandardAnalyzer();
    Query query;
    try {
        query = new QueryParser("CONTENT", analyzer).parse(queryString);
    } catch (ParseException e1) {
        e1.printStackTrace();
        return results;
    }

    Directory dir = null;
    IndexSearcher searcher = null;
    try {
        try {
            dir = FSDirectory.getDirectory(targetDir, false);
            searcher = new IndexSearcher(dir);
        } catch (Exception e1) {
            e1.printStackTrace();
            System.out.println("创建索引对象出现异常...");
            return results;
        }

        Hits hits;
        try {
            hits = searcher.search(query);
        } catch (IOException e1) {
            System.out.println("查询索引库出现异常...");
            e1.printStackTrace();
            return results;
        }

        if (hits.length() == 0) {
            System.out.println("0个结果!");
            return results;
        }

        // Must match the yyyy-MM-dd format the indexer stores in CREATEDATE.
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        for (int i = 0; i < hits.length(); i++) {
            Document document;
            try {
                document = hits.doc(i);
            } catch (Exception e1) {
                System.out.println("返回查询结果集出现异常...");
                e1.printStackTrace();
                continue; // skip the unreadable hit instead of dereferencing null
            }

            JournalArticle article = new JournalArticle();
            String created = document.get("CREATEDATE");
            if (created != null && created.length() > 0) {
                try {
                    Date createdOn = dayFormat.parse(created);
                    article.setDisplayDate(createdOn);
                    // Separate instance: Date is mutable and the two setters must not alias.
                    article.setCreateDate(new Date(createdOn.getTime()));
                } catch (java.text.ParseException e) {
                    e.printStackTrace();
                }
            }
            article.setTitle(document.get("TITLE"));
            article.setArticleId(document.get("ARTICLEID"));
            article.setUserName(document.get("USERNAME"));
            article.setUserId(document.get("USERID"));
            results.add(article);
        }

        // Report how long the search took.
        long endTime = new Date().getTime();
        System.out.println("查询过程花费了" + (endTime - startTime) + " 毫秒!");
        return results;
    } finally {
        // Release index file handles; hits have already been copied into results.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (dir != null) {
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Picks the index directory to search beneath {@code dir}.
 *
 * <p>With two or more entries, the second-to-last by sorted name is chosen, i.e. the one
 * before the newest (the original comments call it the "second newest" index). With a
 * single entry, that entry is used.
 *
 * @param dir parent directory holding the index directories; may be {@code null}
 * @return the directory to search, or {@code null} if there is none
 */
private File getTargetDir(File dir) {
    if (dir == null) {
        return null;
    }
    // One listing only: the indexer may add or delete directories concurrently, so two
    // separate listings could disagree on length.
    String[] names = dir.list();
    if (names == null || names.length == 0) {
        return null;
    }
    if (names.length == 1) {
        return new File(dir, names[0]);
    }
    Arrays.sort(names);
    return new File(dir, names[names.length - 2]);
}

//

// public static void main(String[] args) throws Exception {

// new LuceneDBQuery().search("纳税人");

// }

}

配置文件管理类:

Java代码

package com.liferay.portal.util;

import java.io.IOException;

import org.jdom.Document;

import org.jdom.Element;

import org.jdom.JDOMException;

import org.jdom.input.SAXBuilder;

/**
 * Reads the search-side index location from {@code zxt_index.xml} on the classpath.
 */
public class LuceneDBQueryUtil {

    /**
     * Returns the text of the {@code <index>} child of the root element of
     * {@code zxt_index.xml}.
     *
     * @return the configured index path with surrounding whitespace removed, or an empty
     *         string if the file is missing, malformed or has no {@code <index>} element
     */
    public static String getIndexPath() {
        String filePath = "zxt_index.xml";
        String indexPath = "";

        java.net.URL configUrl = Thread.currentThread().getContextClassLoader().getResource(filePath);
        if (configUrl == null) {
            // getResource returns null when the file is not on the classpath.
            System.err.println("Index configuration not found on classpath: " + filePath);
            return indexPath;
        }

        SAXBuilder builder = new SAXBuilder(false);
        try {
            Document doc = builder.build(configUrl);
            Element rootElement = doc.getRootElement();
            Element index = rootElement.getChild("index");
            if (index == null) {
                System.err.println("No <index> element in " + filePath);
                return indexPath;
            }
            indexPath = index.getText().trim();
            System.out.println(indexPath);
        } catch (JDOMException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return indexPath;
    }
}

配置文件:zxt_index.xml

Xml代码

<?xml version="1.0" encoding="UTF-8"?>

<list>

<index>D://index//IndexDB</index>

</list>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: