您的位置:首页 > 其它

中文分词(传统机械方法):正向最大匹配

2012-04-05 22:56 501 查看
//dictionary.h
#include <cstdlib>
#include <fstream>
// <hash_map> is a non-standard compiler extension; std::unordered_map is the standard equivalent.
#include <hash_map>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
using namespace std;

/// In-memory word list used by the segmenter.
///
/// The dictionary is loaded once, in the constructor, from a text file in
/// which the first whitespace-delimited token of every line is a word; any
/// further tokens on the line are ignored.
///
/// All members are defined inside the class (and are therefore implicitly
/// inline) so that this header can be included from more than one
/// translation unit without multiple-definition link errors.
class Cditionary
{
public:
    /// Loads the dictionary from `path`.
    ///
    /// @param path  Word-list file to read; defaults to "wordlist.txt" in the
    ///              current working directory.
    ///
    /// If the file cannot be opened, a message is written to std::cerr and the
    /// process terminates with exit code -1.
    explicit Cditionary(const std::string& path = "wordlist.txt")
    {
        std::ifstream infile(path.c_str());
        if (!infile.is_open())
        {
            // Report the file that was actually requested.
            std::cerr << "Unable to open input file: " << path << " -- bailing out!" << std::endl;
            std::exit(-1);
        }

        std::string line;
        std::string word;
        while (std::getline(infile, line))
        {
            std::istringstream fields(line);
            // Only insert when a token was really extracted: a blank line must
            // not add an empty (or stale) entry to the dictionary.
            if (fields >> word)
                wordhash[word] = 1;
        }
    }

    /// Looks up a word.
    ///
    /// @param w  Candidate word (exact, byte-wise match).
    /// @return   1 if `w` is in the dictionary, 0 otherwise.
    int FindWord(const std::string& w) const
    {
        return wordhash.find(w) != wordhash.end() ? 1 : 0;
    }

private:
    // Set of known words; only key presence matters, the mapped value is always 1.
    std::unordered_map<std::string, int> wordhash;
};


//main.cpp
#include "dictionary.h"
// Upper bound on the length of a candidate word. SegmentSetence compares it
// against std::string::length(), so the unit is bytes, not characters.
#define MaxWordLength 10
// Separator appended after every segmented word in the output.
#define Sep "/"

// Global dictionary consulted by SegmentSetence. As a namespace-scope object it
// is constructed (and the word list is loaded) before main() starts running.
Cditionary WordDic;

// 字符串用最大匹配法处理
string SegmentSetence(string s1)
{
string s2 = "";

while (! s1.empty())
{
int len = s1.length();
if (len > MaxWordLength)
len = MaxWordLength;
string temp = s1.substr(0, len);

int n = WordDic.FindWord(temp);
while (len > 2 && n == 0)
{
len -= 2;
temp = temp.substr(0 , len);
n = WordDic.FindWord(temp);
}
s2 += temp + Sep;
s1 = s1.substr(temp.length(), s1.length());
}
return s2;
}

// Entry point: segments every line of the input file named by argv[1] with
// SegmentSetence and writes the results, one line per input line, to
// "result.txt" in the current working directory.
//
// Returns 0 on success and -1 when the input-file argument is missing or when
// either file cannot be opened.
int main(int argc , char * argv[])
{
    // argv[1] must not be touched unless an argument was actually supplied.
    if (argc < 2)
    {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "segment") << " <input-file>" << std::endl;
        return -1;
    }

    std::ifstream infile(argv[1]); // input corpus
    if (!infile.is_open())
    {
        std::cerr << "Unable to open input file: " << argv[1] << " -- bailing out!" << std::endl;
        return -1;
    }

    std::ofstream outfile1("result.txt"); // segmentation output
    if (!outfile1.is_open())
    {
        std::cerr << "Unable to open file: result.txt" << " -- bailing out!" << std::endl;
        return -1;
    }

    std::string strtmp; // current line of the corpus
    // Read line by line (newline-delimited) and segment each line.
    while (std::getline(infile, strtmp))
    {
        // '\n' instead of std::endl: no need to flush after every line.
        outfile1 << SegmentSetence(strtmp) << '\n';
    }

    return 0;
}


原理参见:52NLP

每次从剩余文本的开头取词典中能够匹配到的最长词,将其切分出来后,对余下的文本重复同样的匹配过程。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: