应试教育下的三大群体
2009-03-26 17:38
13 查看
SimpleAnalyzer
StandardAnalyzer
WhitespaceAnalyzer
StopAnalyzer
测试代码:
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Demonstrates the token output of four classic Lucene analyzers
 * (Whitespace, Simple, Stop, Standard) on a sample string.
 *
 * NOTE(review): written against the pre-2.9 Lucene API
 * ({@code TokenStream.next()} returning {@code Token}, {@code Token.termText()});
 * this API was removed in later Lucene versions.
 */
public class TestAnalyzer {

    private static String testString1 = "The quick brown fox jumped over the lazy dogs";
    private static String testString2 = "xy&z mail is - xyz@sohu.com";

    /**
     * Prints every token produced by the given stream, one per line.
     * Shared by all four demos so the duplicated read loop lives in one place.
     *
     * @param ts token stream to drain; consumed completely
     * @throws Exception propagated from {@code TokenStream.next()}
     */
    private static void printTokens(TokenStream ts) throws Exception {
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    /** Tokenizes with WhitespaceAnalyzer: splits on whitespace only, punctuation kept. */
    public static void testWhitespace(String testString) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();
        Reader r = new StringReader(testString);
        // Program to the TokenStream interface instead of casting to Tokenizer:
        // the concrete stream class is an implementation detail of the analyzer
        // and the cast can throw ClassCastException if the chain changes.
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Whitespace analyzer====");
        System.err.println("分析方法:空格分割");
        printTokens(ts);
    }

    /** Tokenizes with SimpleAnalyzer: splits on whitespace and non-letter characters. */
    public static void testSimple(String testString) throws Exception {
        Analyzer analyzer = new SimpleAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Simple analyzer====");
        System.err.println("分析方法:空格及各种符号分割");
        printTokens(ts);
    }

    /** Tokenizes with StopAnalyzer: like SimpleAnalyzer, then drops English stop words. */
    public static void testStop(String testString) throws Exception {
        Analyzer analyzer = new StopAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====stop analyzer====");
        System.err.println("分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词");
        printTokens(ts);
    }

    /** Tokenizes with StandardAnalyzer: grammar-based, keeps emails/acronyms, drops stop words. */
    public static void testStandard(String testString) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====standard analyzer====");
        System.err.println("分析方法:混合分割,包括了去掉停止词,支持汉语");
        printTokens(ts);
    }

    public static void main(String[] args) throws Exception {
        // String testString = testString1;
        String testString = testString2;
        System.out.println(testString);
        testWhitespace(testString);
        testSimple(testString);
        testStop(testString);
        testStandard(testString);
    }
}
运行结果:
xy&z mail is - xyz@sohu.com
=====Whitespace analyzer====
分析方法:空格分割
xy&z
mail
is
-
xyz@sohu.com
=====Simple analyzer====
分析方法:空格及各种符号分割
xy
z
mail
is
xyz
sohu
com
=====stop analyzer====
分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词
xy
z
mail
xyz
sohu
com
=====standard analyzer====
分析方法:混合分割,包括了去掉停止词,支持汉语
xy&z
mail
xyz@sohu.com
SimpleAnalyzer
StandardAnalyzer
WhitespaceAnalyzer
StopAnalyzer
测试代码:
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Demonstrates the token output of four classic Lucene analyzers
 * (Whitespace, Simple, Stop, Standard) on a sample string.
 *
 * NOTE(review): written against the pre-2.9 Lucene API
 * ({@code TokenStream.next()} returning {@code Token}, {@code Token.termText()});
 * this API was removed in later Lucene versions.
 */
public class TestAnalyzer {

    private static String testString1 = "The quick brown fox jumped over the lazy dogs";
    private static String testString2 = "xy&z mail is - xyz@sohu.com";

    /**
     * Drains the given stream and prints each token on its own line.
     * Centralizes the read loop that was previously copied into each demo.
     *
     * @param ts token stream to consume
     * @throws Exception propagated from {@code TokenStream.next()}
     */
    private static void printTokens(TokenStream ts) throws Exception {
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    /** WhitespaceAnalyzer demo: splits on whitespace only, punctuation kept. */
    public static void testWhitespace(String testString) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();
        Reader r = new StringReader(testString);
        // Use the TokenStream interface rather than downcasting to Tokenizer:
        // the concrete stream class is an analyzer implementation detail and
        // the cast can throw ClassCastException if the analysis chain changes.
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Whitespace analyzer====");
        System.err.println("分析方法:空格分割");
        printTokens(ts);
    }

    /** SimpleAnalyzer demo: splits on whitespace and non-letter characters. */
    public static void testSimple(String testString) throws Exception {
        Analyzer analyzer = new SimpleAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Simple analyzer====");
        System.err.println("分析方法:空格及各种符号分割");
        printTokens(ts);
    }

    /** StopAnalyzer demo: like SimpleAnalyzer, then drops English stop words. */
    public static void testStop(String testString) throws Exception {
        Analyzer analyzer = new StopAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====stop analyzer====");
        System.err.println("分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词");
        printTokens(ts);
    }

    /** StandardAnalyzer demo: grammar-based, keeps emails/acronyms, drops stop words. */
    public static void testStandard(String testString) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====standard analyzer====");
        System.err.println("分析方法:混合分割,包括了去掉停止词,支持汉语");
        printTokens(ts);
    }

    public static void main(String[] args) throws Exception {
        // String testString = testString1;
        String testString = testString2;
        System.out.println(testString);
        testWhitespace(testString);
        testSimple(testString);
        testStop(testString);
        testStandard(testString);
    }
}
运行结果:
xy&z mail is - xyz@sohu.com
=====Whitespace analyzer====
分析方法:空格分割
xy&z
mail
is
-
xyz@sohu.com
=====Simple analyzer====
分析方法:空格及各种符号分割
xy
z
mail
is
xyz
sohu
com
=====stop analyzer====
分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词
xy
z
mail
xyz
sohu
com
=====standard analyzer====
分析方法:混合分割,包括了去掉停止词,支持汉语
xy&z
mail
xyz@sohu.com
相关文章推荐
- 应试教育下的三大群体
- 中国要破三大难题方可崛起
- 谷歌三大核心技术(一)Google File System中文版
- Google MapReduce/GFS/BigTable三大技术的论文中译版
- 第二章:泥潭中的群体
- 关于三大主流移动操作系统的看法
- 计算机视觉研究群体及专家主页汇总
- 从程序员到项目经理:项目管理三大目标
- 引用和指针三大区别:
- C语言中三大经典的排序算法
- 迟来的淘宝SEO规则-淘宝SEO三大定律
- OOP的三大特性
- 三大框架struts+spring+ibatis
- 深入浅出Java三大框架SSH与MVC的设计模式
- 阿里巴巴入股新浪微博的三大原因
- 三大服务器对比分析
- 三大互联网巨头抢购高德软件 阿里巴巴胜算最大
- 出口同比中国经济三大怪状折射出啥危机?
- 数据库三大范式详解
- Java 入门 四 (Java 三大特性之一 继承)