您的位置:首页 > 编程语言 > C语言/C++

英文文本词频统计

2014-10-21 10:40 309 查看
软件工程作业:

       老师在国庆前有布置一个任务,就是对一个英文文本进行词频统计,输出词频最高的10个单词及次数。还要做性能分析,简直抓狂啊~~~~从未做过。

       经过老师的点评,自己在后期做了一些修正,现在把最终版本拿出来。

       我还是采用的C语言,使用链表加上字符数组,自己写了一个过滤单词的字典,实现了老师要求的功能。

       主要的程序设计思想是采用以下步骤:

      


    

      具体到程序来讲就主要是以下两个函数

       List *Statistic() //用来进行读取文件,并进行词典过滤

/*
 * Ask the user for a text file path, read the file word by word and build a
 * singly linked list holding every distinct word that is not a stop word,
 * together with its occurrence count.
 *
 * Words are whitespace-separated tokens and are compared case-sensitively.
 * A token longer than 29 characters is read in pieces of at most 29
 * characters so that it always fits in List.word.
 *
 * Returns the head of the list. Every node is allocated with malloc and is
 * never freed here, so the caller owns the list. Returns NULL when no path
 * could be read, the file cannot be opened, or no word was counted.
 */
struct List *Statistic(void)
{
    /* Stop words that are excluded from the statistics. */
    static const char *const b[] = {
        "I","We","He","She","Her","His","is","a","an","the",
        "It","it","they","They","he","she","his","her","we",
        "are","to","in","that","and","all","was","would","of",
        "i","for","could","had","when","as","on","not","us","him","this","so","out",
        "our","never","up","how","at","few","often","get",
        "after","have","their","there","around","be","if","were",
        "again","didn't","your","take","you","it's","toward","with",
        "yourself","than","rather","what","don't","you're","or",
        "--","you've","my","but","from","more","no","its","do",
        "which","them","go","just","by","will",
        "me","can","then","s","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"
    };
    struct List *Head = NULL;
    FILE *fp;
    char infilename[1024];
    char p[30];                 /* one token; same capacity as List.word */

    printf("请输入文件路径:\n");
    /* Field width keeps the path inside infilename. */
    if (scanf("%1023s", infilename) != 1)
        return NULL;

    fp = fopen(infilename, "r");
    if (fp == NULL) {
        perror(infilename);
        return NULL;
    }

    /*
     * Loop on the result of fscanf instead of feof(): feof() only becomes
     * true after a read has already failed, which made the old loop process
     * one stale token at end of file.
     */
    while (fscanf(fp, "%29s", p) == 1) {
        int is_stop_word = 0;
        struct List *L;

        /* Explicit equality test; multiplying strcmp results can overflow. */
        for (size_t i = 0; i < sizeof(b) / sizeof(b[0]); i++) {
            if (strcmp(b[i], p) == 0) {
                is_stop_word = 1;
                break;
            }
        }
        if (is_stop_word)
            continue;

        /* Already known word: just bump its counter. */
        for (L = Head; L != NULL; L = L->next) {
            if (strcmp(L->word, p) == 0) {
                L->m++;
                break;
            }
        }

        /* New word: prepend a node (also covers the empty-list case). */
        if (L == NULL) {
            struct List *temp = (struct List *)malloc(sizeof(struct List));
            if (temp == NULL) {
                fprintf(stderr, "out of memory while counting words\n");
                break;          /* return what has been counted so far */
            }
            strcpy(temp->word, p);  /* p holds at most 29 chars + NUL */
            temp->m = 1;
            temp->next = Head;
            Head = temp;
        }
    }

    fclose(fp);
    return Head;
}


       void Printf  (List *Head) //输出统计的词频

void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}

    然后过滤单词系统中采用的词典主要是我自己写的词典

/*
 * Stop-word dictionary: words listed here are excluded from the frequency
 * statistics. Matching is case-sensitive, which is why several words appear
 * in both capitalised and lower-case form. Each entry appears exactly once.
 * The entries are string literals and must not be modified, hence const.
 */
const char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","all","was","would","of",
"i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","just","by","will",
"me","can","then","s","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};

通过上面三个过程,实现了功能,上面贴的代码只是自定义函数的代码,接下来附上源代码

源代码

#include<stdio.h>
#include<stdlib.h>
#include<string.h>

/* One node of the singly linked word-frequency list. */
struct List {
    char word[30];       /* the word as a NUL-terminated string */
    int m;               /* occurrence count; Printf resets it to 0 once reported */
    struct List *next;   /* following node, or NULL at the end of the list */
};

/* Reads the input file and returns the head of the word-frequency list. */
struct List *Statistic(void);
/* Prints the most frequent words of the list starting at Head. */
void Printf(struct List *Head);

/* Program entry point: build the word-frequency list, then report it. */
int main()
{
    struct List *words = Statistic();

    Printf(words);
    return 0;
}

/*
 * Ask the user for a text file path, read the file word by word and build a
 * singly linked list holding every distinct word that is not a stop word,
 * together with its occurrence count.
 *
 * Words are whitespace-separated tokens and are compared case-sensitively.
 * A token longer than 29 characters is read in pieces of at most 29
 * characters so that it always fits in List.word.
 *
 * Returns the head of the list. Every node is allocated with malloc and is
 * never freed here, so the caller owns the list. Returns NULL when no path
 * could be read, the file cannot be opened, or no word was counted.
 */
struct List *Statistic(void)
{
    /* Stop words that are excluded from the statistics. */
    static const char *const b[] = {
        "I","We","He","She","Her","His","is","a","an","the",
        "It","it","they","They","he","she","his","her","we",
        "are","to","in","that","and","all","was","would","of",
        "i","for","could","had","when","as","on","not","us","him","this","so","out",
        "our","never","up","how","at","few","often","get",
        "after","have","their","there","around","be","if","were",
        "again","didn't","your","take","you","it's","toward","with",
        "yourself","than","rather","what","don't","you're","or",
        "--","you've","my","but","from","more","no","its","do",
        "which","them","go","just","by","will",
        "me","can","then","s","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"
    };
    struct List *Head = NULL;
    FILE *fp;
    char infilename[1024];
    char p[30];                 /* one token; same capacity as List.word */

    printf("请输入文件路径:\n");
    /* Field width keeps the path inside infilename. */
    if (scanf("%1023s", infilename) != 1)
        return NULL;

    fp = fopen(infilename, "r");
    if (fp == NULL) {
        perror(infilename);
        return NULL;
    }

    /*
     * Loop on the result of fscanf instead of feof(): feof() only becomes
     * true after a read has already failed, which made the old loop process
     * one stale token at end of file.
     */
    while (fscanf(fp, "%29s", p) == 1) {
        int is_stop_word = 0;
        struct List *L;

        /* Explicit equality test; multiplying strcmp results can overflow. */
        for (size_t i = 0; i < sizeof(b) / sizeof(b[0]); i++) {
            if (strcmp(b[i], p) == 0) {
                is_stop_word = 1;
                break;
            }
        }
        if (is_stop_word)
            continue;

        /* Already known word: just bump its counter. */
        for (L = Head; L != NULL; L = L->next) {
            if (strcmp(L->word, p) == 0) {
                L->m++;
                break;
            }
        }

        /* New word: prepend a node (also covers the empty-list case). */
        if (L == NULL) {
            struct List *temp = (struct List *)malloc(sizeof(struct List));
            if (temp == NULL) {
                fprintf(stderr, "out of memory while counting words\n");
                break;          /* return what has been counted so far */
            }
            strcpy(temp->word, p);  /* p holds at most 29 chars + NUL */
            temp->m = 1;
            temp->next = Head;
            Head = temp;
        }
    }

    fclose(fp);
    return Head;
}

void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}

运行一个名叫test1.txt的文本以后,统计的词频结果如下。



接下来是对程序进行性能分析,我采用的是VS2012自带的性能分析工具。







     经过性能分析以后,发现字典过滤模块还可以进行优化。我采用的是最传统的做法:先构建字典,然后用for循环逐个调用strcmp进行比较。上网查阅以后发现,其实还有更好的方法,那就是采用Hash表或者正则表达式等来进行过滤,性能会更高,我会在后面有时间的时候加以尝试。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息