您的位置:首页 > 其它

贝叶斯分类器的简单应用-拼写检查的实现

2015-06-30 19:30 295 查看
最近看了贝叶斯定理,挺有意思的。下面是贝叶斯定理在拼写检查中的一个应用,具体的理论部分请参看阮一峰老师的文章:

点击打开链接。Python 实现只用了21行。由于编辑距离为2的扩展实在是太大了,这里就只实现了编辑距离为1的查找。

以C#实现(Ps:编辑距离可以更加完善,数据字典的效率比较低啊)

/// <summary>
/// Word-frequency table: counts how many times each word occurs in a word list.
/// </summary>
public class Statistics
{
    // Maps each distinct word to the number of times it appeared in the
    // list handed to the constructor.
    public Dictionary<string, int> di_text;

    /// <summary>
    /// Builds the frequency table from the given words.
    /// </summary>
    /// <param name="Li_text">Words to count; repeated entries increase the count of that word.</param>
    public Statistics(List<string> Li_text)
    {
        di_text = new Dictionary<string, int>();

        for (int index = 0; index < Li_text.Count; index++)
        {
            string current = Li_text[index];
            int seenSoFar;

            // First sighting starts at 1; later sightings bump the stored count.
            di_text[current] = di_text.TryGetValue(current, out seenSoFar) ? seenSoFar + 1 : 1;
        }
    }
}


/// <summary>
/// Edit-distance helper: for each distinct word it records every string that
/// is exactly one edit (delete, adjacent transpose, replace, insert) away.
/// </summary>
class Edit
{
    // Letters used when generating replacements and insertions.
    private const string Alphabet = "abcdefghijklmnopqrstuvwxyz";

    // Maps a word to its edit-distance-1 candidates. Identical words share one
    // entry; callers weight by word frequency instead of storing duplicates.
    // The candidate list itself may contain repeated strings (e.g. deleting
    // either 'l' of "hello" gives "helo" twice).
    public Dictionary<string, List<string>> Di_Word_Edit = new Dictionary<string, List<string>>();

    /// <summary>
    /// Adds an entry to <see cref="Di_Word_Edit"/> for every word of
    /// <paramref name="text"/> that does not have one yet.
    /// </summary>
    /// <param name="text">Words to expand. The list is only read, never modified.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public void Edit_1(ref List<string> text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        foreach (string word in text)
        {
            // Null entries cannot be dictionary keys; repeated words are already expanded.
            if (word == null || Di_Word_Edit.ContainsKey(word))
            {
                continue;
            }

            Di_Word_Edit.Add(word, BuildEdits(word));
        }
    }

    // Returns all strings one edit away from word, in the order:
    // deletes, adjacent transposes, replaces, inserts.
    // Prefix/suffix "splits" are deliberately NOT included: they are not
    // single edits and would make partial words and the empty string match.
    private static List<string> BuildEdits(string word)
    {
        int length = word.Length;
        List<string> edits = new List<string>(Alphabet.Length * (2 * length + 1) + 2 * length);

        // Deletes: drop one character.
        for (int i = 0; i < length; i++)
        {
            edits.Add(word.Remove(i, 1));
        }

        // Transposes: swap each pair of neighbouring characters.
        for (int i = 0; i < length - 1; i++)
        {
            char[] letters = word.ToCharArray();
            char swapped = letters[i];
            letters[i] = letters[i + 1];
            letters[i + 1] = swapped;
            edits.Add(new string(letters));
        }

        // Replaces: substitute each position with every letter of the alphabet
        // (this includes the original letter, so the word itself appears here).
        for (int i = 0; i < length; i++)
        {
            string head = word.Substring(0, i);
            string tail = word.Substring(i + 1);
            foreach (char letter in Alphabet)
            {
                edits.Add(head + letter + tail);
            }
        }

        // Inserts: put every letter at every position, including the end.
        for (int i = 0; i <= length; i++)
        {
            foreach (char letter in Alphabet)
            {
                edits.Add(word.Insert(i, letter.ToString()));
            }
        }

        return edits;
    }
}

/// <summary>
/// Console spell checker: loads a corpus, precomputes edit-distance-1
/// candidates for every corpus word, then reads words from standard input and
/// prints either the word itself (if known) or the best-scoring correction.
/// </summary>
class Program
{
    // Location of the corpus file; assigned by List_Intialize before String_Copy reads it.
    static string path;

    /// <summary>
    /// Appends every alphabetic word of the corpus file, lower-cased, to <paramref name="li_text"/>.
    /// </summary>
    /// <param name="li_text">List that receives the words.</param>
    static void List_Intialize(ref List<string> li_text)
    {
        path = "D:/test.txt";

        Regex wordPattern = new Regex("[a-z]+", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        foreach (Match match in wordPattern.Matches(String_Copy()))
        {
            // Invariant lowering keeps words inside the a-z alphabet regardless of the OS culture.
            li_text.Add(match.Value.ToLowerInvariant());
        }
    }

    /// <summary>
    /// Finds the corpus word whose edit-distance-1 candidates contain
    /// <paramref name="read_string"/> with the highest score and prints it
    /// together with the elapsed search time ("Not Found" if nothing matches).
    /// </summary>
    /// <param name="read_string">The (misspelled) input word.</param>
    /// <param name="edit">Precomputed candidate lists per corpus word.</param>
    /// <param name="state">Corpus word frequencies.</param>
    static void Edit1_Seek(string read_string, Edit edit, Statistics state)
    {
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

        string bestWord = "Not Found";
        double bestScore = 0;

        foreach (KeyValuePair<string, List<string>> entry in edit.Di_Word_Edit)
        {
            List<string> candidates = entry.Value;

            int matches = 0;
            foreach (string candidate in candidates)
            {
                if (candidate == read_string)
                {
                    matches++;
                }
            }

            int frequency;
            if (matches == 0 || !state.di_text.TryGetValue(entry.Key, out frequency))
            {
                continue;
            }

            // Score = share of this word's candidates equal to the input, weighted by
            // how often the word occurs in the corpus. It is kept as a double: integer
            // division would truncate the ratio to zero and defeat the comparison.
            double score = (double)matches / candidates.Count * frequency;
            if (score > bestScore)
            {
                bestScore = score;
                bestWord = entry.Key;
            }
        }

        timer.Stop();
        Console.WriteLine(bestWord + "   Times:   " + timer.Elapsed.TotalSeconds);
    }

    /// <summary>
    /// Builds the word statistics and candidate table, then answers queries
    /// read line by line from standard input until end of input.
    /// </summary>
    static void Main(string[] args)
    {
        // All corpus words, in file order, duplicates included.
        List<string> li_text = new List<string>();
        List_Intialize(ref li_text);
        Statistics stat = new Statistics(li_text);
        Edit edit = new Edit();
        edit.Edit_1(ref li_text);
        Console.WriteLine("Analysis Down!");

        while (true)
        {
            string line = Console.ReadLine();
            if (line == null)
            {
                // End of input (closed or redirected stdin): stop instead of crashing.
                break;
            }

            // Normalise once so the exact-match test and the correction search see the same text.
            string read_string = line.Trim().ToLowerInvariant();

            if (edit.Di_Word_Edit.ContainsKey(read_string))
            {
                // Known word: echo it unchanged.
                Console.WriteLine(read_string);
            }
            else
            {
                // Unknown word: look for it among the edit-distance-1 candidates of every corpus word.
                Edit1_Seek(read_string, edit, stat);
            }
        }
    }

    /// <summary>
    /// Reads the whole corpus file at <see cref="path"/> into a string.
    /// </summary>
    static string String_Copy()
    {
        using (StreamReader reader = new StreamReader(path))
        {
            return reader.ReadToEnd();
        }
    }
}


实现结果如下

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: