您的位置：首页 > 其它

es SynonymTokenFilterFactory 源码

2016-03-10 16:05 281 查看

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import java.io.Reader;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

private final SynonymMap synonymMap;
private final boolean ignoreCase;

@Inject
public SynonymTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
@Assisted String name, @Assisted Settings settings) {
super(index, indexSettingsService.getSettings(), name, settings);
//同义词流
Reader rulesReader = null;
//获取配置中的synonyms的同义词配置
if (settings.getAsArray("synonyms", null) != null) {
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
StringBuilder sb = new StringBuilder();
for (String line : rules) {
sb.append(line).append(System.getProperty("line.separator"));
}
rulesReader = new FastStringReader(sb.toString());
//获取配置文件中同义词配置synonyms_path
} else if (settings.get("synonyms_path") != null) {
//获取配置路径的同义词文件流
rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
} else {
throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
}

this.ignoreCase = settings.getAsBoolean("ignore_case", false);
boolean expand = settings.getAsBoolean("expand", true);

//获取 tokenizer
String tokenizerName = settings.get("tokenizer", "whitespace");

//获取TokenizerFactoryFactory
TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
if (tokenizerFactoryFactory == null) {
tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
}
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
}

final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, Settings.builder().put(indexSettingsService.getSettings()).put(settings).build());

Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};

try {
SynonymMap.Builder parser = null;

if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
parser = new WordnetSynonymParser(true, expand, analyzer);
//解析同义词数据流
((WordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new SolrSynonymParser(true, expand, analyzer);
((SolrSynonymParser) parser).parse(rulesReader);
}

synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}

@Override
public TokenStream create(TokenStream tokenStream) {
// fst is null means no synonyms
//使用 lucene 中的 SynonymFilter
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
}
}

/**
* @return null If no settings set for "settingsPrefix" then return <code>null</code>.
* @throws IllegalArgumentException
*          If the Reader can not be instantiated.
* 获取配置同义词流
*/
public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
String filePath = settings.get(settingPrefix, null);

if (filePath == null) {
return null;
}

final Path path = env.configFile().resolve(filePath);

try {
return FileSystemUtils.newBufferedReader(path.toUri().toURL(), Charsets.UTF_8);
} catch (IOException ioe) {
String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
throw new IllegalArgumentException(message);
}
}

package org.apache.lucene.analysis.synonym;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
* Parser for wordnet prolog format
* <p>
* See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
* @lucene.experimental SynonymMap 解析子类

*/
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Parser {
private final boolean expand;

public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup, analyzer);
this.expand = expand;
}

@Override
public void parse(Reader in) throws IOException, ParseException {
//一行一行解析
LineNumberReader br = new LineNumberReader(in);
try {
String line = null;
String lastSynSetID = "";
CharsRef synset[] = new CharsRef[8];
int synsetSize = 0;

while ((line = br.readLine()) != null) {
String synSetID = line.substring(2, 11);

if (!synSetID.equals(lastSynSetID)) {
addInternal(synset, synsetSize);
synsetSize = 0;
}

if (synset.length <= synsetSize+1) {
synset = Arrays.copyOf(synset, synset.length * 2);
}

synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
synsetSize++;
lastSynSetID = synSetID;
}

// final synset in the file
addInternal(synset, synsetSize);
} catch (IllegalArgumentException e) {
ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
ex.initCause(e);
throw ex;
} finally {
br.close();
}
}

private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
if (reuse == null) {
reuse = new CharsRefBuilder();
}

int start = line.indexOf('\'')+1;
int end = line.lastIndexOf('\'');

String text = line.substring(start, end).replace("''", "'");
return analyze(text, reuse);
}

private void addInternal(CharsRef synset[], int size) {
if (size <= 1) {
return; // nothing to do
}

if (expand) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
add(synset[i], synset[j], false);
}
}
} else {
for (int i = 0; i < size; i++) {
add(synset[i], synset[0], false);
}
}
}
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航