OpenNLP入门实验
2016-11-24 15:58
27 查看
OpenNLP
它也是自然语言分析处理的工具,是 Apache 的开源项目,感觉这玩意儿更容易入手,使用的人更多。
句子探测的例子
注意,首先,添加 opennlp 的 pom 依赖:
<!-- OpenNLP -->
<dependency>
    <groupId>org.apache.opennlp</groupId>
    <artifactId>opennlp-tools</artifactId>
    <version>1.6.0</version>
</dependency>
接着还得下载预训练好的模型文件,
地址是 http://opennlp.sourceforge.net/models-1.5/ 。句子探测、分词等功能都有各自对应的模型,例如句子探测使用的是 en-sent.bin。
在使用的时候得添加训练模型的文件路径
// Location of the sentence-detection model file on the author's machine.
String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/"
        + "com/hainan/cs/OpenNLPModels/en-sent.bin";
// always start with a model, a model is learned from training data
InputStream is = new FileInputStream(path);
SentenceModel model = new SentenceModel(is);
SentenceDetectorME sdetector = new SentenceDetectorME(model);
句子探测代码:
package com.hainan.cs.opennlp;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
/**
 * Demo of OpenNLP sentence detection: loads the {@code en-sent.bin} model and
 * prints the sentences found in a short English paragraph.
 */
public class SentenceDetector {

    /**
     * Loads the sentence model from a hard-coded path, runs the detector on a
     * fixed paragraph and prints every detected sentence on its own line.
     *
     * @param args unused
     * @throws IOException if the model file cannot be opened or read
     */
    public static void main(String args[]) throws IOException {
        String paragraph = "Hi. How are you? This is Mike.";
        String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/"
                + "com/hainan/cs/OpenNLPModels/en-sent.bin";

        // always start with a model, a model is learned from training data
        // try-with-resources closes the stream even when loading or detection throws.
        try (InputStream is = new FileInputStream(path)) {
            SentenceModel model = new SentenceModel(is);
            SentenceDetectorME sdetector = new SentenceDetectorME(model);

            String[] sentences = sdetector.sentDetect(paragraph);
            // Iterate instead of indexing [0] and [1] so that a paragraph with
            // fewer than two sentences cannot cause an out-of-bounds access.
            for (String sentence : sentences) {
                System.out.println(sentence);
            }
        }
    }
}
句子探测结果:
分词的例子
直接上代码,使用的模型在代码里
package com.hainan.cs.opennlp;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;

/**
 * Demo of OpenNLP tokenization: loads the {@code en-token.bin} model and
 * prints each token of a short English paragraph on its own line.
 */
public class Tokenizer {

    /**
     * Loads the tokenizer model from a hard-coded path, tokenizes a fixed
     * paragraph and prints the tokens.
     *
     * @param args unused
     * @throws InvalidFormatException declared by the original example; kept so the signature is unchanged
     * @throws IOException if the model file cannot be opened or read
     */
    public static void main(String args[]) throws InvalidFormatException, IOException {
        String paragraph = "Hi. How are you? This is Mike.";
        String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/"
                + "com/hainan/cs/OpenNLPModels/en-token.bin";

        // try-with-resources closes the stream even when loading or tokenizing throws.
        try (InputStream is = new FileInputStream(path)) {
            TokenizerModel model = new TokenizerModel(is);
            TokenizerME tokenizer = new TokenizerME(model);

            // Use the paragraph variable rather than repeating the literal.
            String[] tokens = tokenizer.tokenize(paragraph);
            for (String token : tokens) {
                System.out.println(token);
            }
        }
    }
}
结果:
名字查找,比如说要识别哪些是人名
package com.hainan.cs.opennlp;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

/**
 * Demo of OpenNLP name finding: loads the {@code en-ner-person.bin} model and
 * prints the spans it reports for an already-tokenized sentence.
 */
public class NameFinder {

    /**
     * Loads the person-name model from a hard-coded path, runs the name finder
     * on a fixed token array and prints each resulting {@link Span}.
     *
     * @param args unused
     * @throws IOException if the model file cannot be opened or read
     */
    public static void main(String args[]) throws IOException {
        String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/"
                + "com/hainan/cs/OpenNLPModels/en-ner-person.bin";

        // As in the original, the stream is closed right after the model is
        // constructed; try-with-resources also closes it if construction throws.
        TokenNameFinderModel model;
        try (InputStream is = new FileInputStream(path)) {
            model = new TokenNameFinderModel(is);
        }

        NameFinderME nameFinder = new NameFinderME(model);
        // The name finder takes tokens, not raw text.
        String[] sentence = new String[] { "Mike", "Smith", "is", "a", "good", "person" };
        Span[] nameSpans = nameFinder.find(sentence);
        for (Span s : nameSpans) {
            System.out.println(s.toString());
        }
    }
}
词性标注
package com.hainan.cs.opennlp; import java.io.File; import java.io.IOException; import java.io.StringReader; import opennlp.tools.cmdline.PerformanceMonitor; import opennlp.tools.cmdline.postag.POSModelLoader; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSSample; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; public class POS { public static void main(String args[]) throws IOException { String paragraph = "Hi. How are you? This is Mike."; String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/" + "com/hainan/cs/OpenNLPModels/en-pos-maxent.bin"; POSModel model = new POSModelLoader().load(new File(path)); PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent"); POSTaggerME tagger = new POSTaggerME(model); String input = "Hi. How are you? This is Mike."; ObjectStream<String> lineStream = new PlainTextByLineStream(new StringReader(input)); perfMon.start(); String line; while ((line = lineStream.read()) != null) { String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line); String[] tags = tagger.tag(whitespaceTokenizerLine); POSSample sample = new POSSample(whitespaceTokenizerLine, tags); System.out.println(sample.toString()); perfMon.incrementCounter(); } perfMon.stopAndPrintFinalResult(); } }
结果:
Chunker
package com.hainan.cs.opennlp; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.cmdline.PerformanceMonitor; import opennlp.tools.cmdline.postag.POSModelLoader; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSSample; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.Span; public class Chunker { public static void main(String args[]) throws IOException { String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/" + "com/hainan/cs/OpenNLPModels/en-pos-maxent.bin"; POSModel model = new POSModelLoader().load(new File(path)); PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent"); POSTaggerME tagger = new POSTaggerME(model); String input = "Hi. How are you? 
This is Mike."; ObjectStream<String> lineStream = new PlainTextByLineStream(new StringReader(input)); perfMon.start(); String line; String whitespaceTokenizerLine[] = null; String[] tags = null; while ((line = lineStream.read()) != null) { whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line); tags = tagger.tag(whitespaceTokenizerLine); POSSample sample = new POSSample(whitespaceTokenizerLine, tags); System.out.println(sample.toString()); perfMon.incrementCounter(); } perfMon.stopAndPrintFinalResult(); // chunker String path1 = "E:/EclipseWorkSpace/hibernettest/src/main/resources/" + "com/hainan/cs/OpenNLPModels/en-chunker.bin"; InputStream is = new FileInputStream(path1); ChunkerModel cModel = new ChunkerModel(is); ChunkerME chunkerME = new ChunkerME(cModel); String result[] = chunkerME.chunk(whitespaceTokenizerLine, tags); for (String s : result) System.out.println(s); Span[] span = chunkerME.chunkAsSpans(whitespaceTokenizerLine, tags); for (Span s : span) System.out.println(s.toString()); } }
结果: 看不懂这是个啥求,先放着吧
分析器parser
package com.hainan.cs.opennlp; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import opennlp.tools.cmdline.parser.ParserTool; import opennlp.tools.parser.Parse; import opennlp.tools.parser.ParserFactory; import opennlp.tools.parser.ParserModel; import opennlp.tools.util.InvalidFormatException; public class Parser { public static void main(String args[]) throws InvalidFormatException, IOException { // http://sourceforge.net/apps/mediawiki/opennlp/index.php?title=Parser#Training_Tool String path = "E:/EclipseWorkSpace/hibernettest/src/main/resources/" + "com/hainan/cs/OpenNLPModels/en-parser-chunking.bin"; InputStream is = new FileInputStream(path); ParserModel model = new ParserModel(is); Parser parser = ParserFactory.create(model); String sentence = "Programcreek is a very huge and useful website."; Parse topParses[] = ParserTool.parseLine(sentence, (opennlp.tools.parser.Parser) parser, 1); for (Parse p : topParses) p.show(); is.close(); /* * (TOP (S (NP (NN Programcreek) ) (VP (VBZ is) (NP (DT a) (ADJP (RB * very) (JJ huge) (CC and) (JJ useful) ) ) ) (. website.) ) ) */ } }
结果:
自然语言处理两个最重要的方面就是词性标注和句法分析。例子都有了。
相关文章推荐
- 仅用500行Python代码实现一个英文解析器的教程
- 用Python进行一些简单的自然语言处理的教程
- 自然语言理解(摘自中南大学课件)
- 自然语言也支持泛型
- openNLP在eclipse中配置和使用
- openNLP在eclipse中配置和使用
- 关于 Cucumber 与 自动化测试的自我感触
- 基于神经网络的统计语言模型-----第一章 引言
- 基于神经网络的统计语言模型-----第一章 引言
- The next day yo learn Python(第二天学习Python)
- BosonNLP分词技术解密
- 隐马尔科夫模型详解
- word2Vec浅析
- 自然语言处理(NLP)在企业应用中的实践(一)
- 自然语言处理(NLP)在企业应用中的实践(二)
- 自然语言处理(NLP)在企业应用中的实践(三)
- 自然语言处理(NLP)在企业应用中的实践(四)
- Web自动化测试(1): Python+Behave+ Selenium Web Driver 在windows系统的安装
- Web自动化测试(3): Selenium Web Driver 如何操作web页面。
- Web自动化测试(4): Selenium Web Driver 操作控件完成页面自动化测试的例子。