英文文本词频统计
2014-10-21 10:40
309 查看
软件工程作业:
老师在国庆前有布置一个任务,就是对一个英文文本进行词频统计,输出词频最高的10个单词及次数。还要做性能分析,简直抓狂啊~~~~从未做过。
经过老师的点评,自己在后期做了一些修正,现在把最终版本拿出来。
我还是采用的C语言,使用链表加上字符数组,自己写了一个过滤单词的字典,实现了老师要求的功能。
主要的程序设计思想是采用以下步骤:
具体到程序来讲就主要是以下两个函数
List *Statistic() //用来进行读取文件,并进行词典过滤
List *Statistic()
{
struct List *Head=NULL;
FILE *fp;
char infilename[1024];
int n =1;
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
printf("请输入文件路径:\n");
scanf("%s",infilename);
fp=fopen(infilename,"r");//读文件
while(!feof(fp))
{
char *p=(char*)malloc(30*sizeof(char));
fscanf(fp,"%s",p);
for(int i=0;i<sizeof(b)/sizeof(b[0]);i++)
{
n *= strcmp(b[i],p);
}
if(n!=0)
{
if(Head==NULL)//分析单词频率
{
struct List *temp=(struct List*)malloc(sizeof(struct List));
strcpy(temp->word,p);
temp->m=1;
temp->next=NULL;
Head=temp;
}
else
{
struct List *L=Head;
while(L!=NULL)
{
if(strcmp(L->word,p)==0)
{
int count = L->m;
count++;
L->m = count;
break;
}
L=L->next;
}
if(L==NULL)
{
struct List*temp = (struct List*)malloc(sizeof(struct List));
strcpy(temp->word, p);
temp->m=1;
temp->next=Head;
Head=temp;
}
}
}
else
{
n=1;
}
}
return Head;
}
void Printf (List *Head) //输出统计的词频
void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}
然后过滤单词系统中采用的词典主要是我自己写的词典
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
通过上面三个过程,实现了功能,上面贴的代码只是自定义函数的代码,接下来附上源代码
源代码
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Linked-list node holding one distinct word and its occurrence count. */
struct List {
char word[30];   /* the word text, NUL-terminated (at most 29 characters) */
int m;           /* occurrence count; Printf() zeroes it once reported */
struct List *next;   /* next node, or NULL at the end of the list */
};
/* NOTE(review): these prototypes use the bare tag `List`, which compiles
 * only as C++; plain C would require `struct List`. */
List *Statistic();
void Printf(List *Head);
int main() //主函数
{
struct List *Head;
Head=Statistic();
Printf(Head);
return 0;
}
List *Statistic()
{
struct List *Head=NULL;
FILE *fp;
char infilename[1024];
int n =1;
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
printf("请输入文件路径:\n");
scanf("%s",infilename);
fp=fopen(infilename,"r");//读文件
while(!feof(fp))
{
char *p=(char*)malloc(30*sizeof(char));
fscanf(fp,"%s",p);
for(int i=0;i<sizeof(b)/sizeof(b[0]);i++)
{
n *= strcmp(b[i],p);
}
if(n!=0)
{
if(Head==NULL)//分析单词频率
{
struct List *temp=(struct List*)malloc(sizeof(struct List));
strcpy(temp->word,p);
temp->m=1;
temp->next=NULL;
Head=temp;
}
else
{
struct List *L=Head;
while(L!=NULL)
{
if(strcmp(L->word,p)==0)
{
int count = L->m;
count++;
L->m = count;
break;
}
L=L->next;
}
if(L==NULL)
{
struct List*temp = (struct List*)malloc(sizeof(struct List));
strcpy(temp->word, p);
temp->m=1;
temp->next=Head;
Head=temp;
}
}
}
else
{
n=1;
}
}
return Head;
}
void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}
运行一个名叫test1.txt的文本以后,统计的词频结果如下。
接下来是对程序进行性能分析,我采用的是VS2012自带的性能分析工具。
经过性能分析以后,发现在字典过滤的模块还可以进行优化,我采用的是最传统的构建字典,然后for循环一个一个的compare,通过网上百度以后发现,其实还有更好的方法,那就是采用Hash表或者正则表达式等等来进行过滤,性能会更高,我会在后面有时间的时候加以尝试。
老师在国庆前有布置一个任务,就是对一个英文文本进行词频统计,输出词频最高的10个单词及次数。还要做性能分析,简直抓狂啊~~~~从未做过。
经过老师的点评,自己在后期做了一些修正,现在把最终版本拿出来。
我还是采用的C语言,使用链表加上字符数组,自己写了一个过滤单词的字典,实现了老师要求的功能。
主要的程序设计思想是采用以下步骤:
具体到程序来讲就主要是以下两个函数
List *Statistic() //用来进行读取文件,并进行词典过滤
List *Statistic()
{
struct List *Head=NULL;
FILE *fp;
char infilename[1024];
int n =1;
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
printf("请输入文件路径:\n");
scanf("%s",infilename);
fp=fopen(infilename,"r");//读文件
while(!feof(fp))
{
char *p=(char*)malloc(30*sizeof(char));
fscanf(fp,"%s",p);
for(int i=0;i<sizeof(b)/sizeof(b[0]);i++)
{
n *= strcmp(b[i],p);
}
if(n!=0)
{
if(Head==NULL)//分析单词频率
{
struct List *temp=(struct List*)malloc(sizeof(struct List));
strcpy(temp->word,p);
temp->m=1;
temp->next=NULL;
Head=temp;
}
else
{
struct List *L=Head;
while(L!=NULL)
{
if(strcmp(L->word,p)==0)
{
int count = L->m;
count++;
L->m = count;
break;
}
L=L->next;
}
if(L==NULL)
{
struct List*temp = (struct List*)malloc(sizeof(struct List));
strcpy(temp->word, p);
temp->m=1;
temp->next=Head;
Head=temp;
}
}
}
else
{
n=1;
}
}
return Head;
}
void Printf (List *Head) //输出统计的词频
void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}
然后过滤单词系统中采用的词典主要是我自己写的词典
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
通过上面三个过程,实现了功能,上面贴的代码只是自定义函数的代码,接下来附上源代码
源代码
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Linked-list node holding one distinct word and its occurrence count. */
struct List {
char word[30];   /* the word text, NUL-terminated (at most 29 characters) */
int m;           /* occurrence count; Printf() zeroes it once reported */
struct List *next;   /* next node, or NULL at the end of the list */
};
/* NOTE(review): these prototypes use the bare tag `List`, which compiles
 * only as C++; plain C would require `struct List`. */
List *Statistic();
void Printf(List *Head);
int main() //主函数
{
struct List *Head;
Head=Statistic();
Printf(Head);
return 0;
}
List *Statistic()
{
struct List *Head=NULL;
FILE *fp;
char infilename[1024];
int n =1;
char *b[] = {"I","We","He","She","Her","His","is","a","an","the",
"It","it","they","They","he","she","his","her","we",
"are","to","in","that","and","the","he","all","was","to","would","and","of",
"that","i","for","could","had","when","as","on","not","us","him","this","so","out",
"our","never","up","how","at","few","often","get",
"after","have","their","there","around","be","if","were",
"again","didn't","your","take","you","it's","toward","with",
"yourself","than","rather","an","what","don't","you're","or",
"--","you've","my","but","from","more","no","its","do",
"which","them","go","are","just","by","will",
"me","can","then","s","all","now","even","into","And","But","who","Then","So","may","thou","thee","Odysseus","thy"};
printf("请输入文件路径:\n");
scanf("%s",infilename);
fp=fopen(infilename,"r");//读文件
while(!feof(fp))
{
char *p=(char*)malloc(30*sizeof(char));
fscanf(fp,"%s",p);
for(int i=0;i<sizeof(b)/sizeof(b[0]);i++)
{
n *= strcmp(b[i],p);
}
if(n!=0)
{
if(Head==NULL)//分析单词频率
{
struct List *temp=(struct List*)malloc(sizeof(struct List));
strcpy(temp->word,p);
temp->m=1;
temp->next=NULL;
Head=temp;
}
else
{
struct List *L=Head;
while(L!=NULL)
{
if(strcmp(L->word,p)==0)
{
int count = L->m;
count++;
L->m = count;
break;
}
L=L->next;
}
if(L==NULL)
{
struct List*temp = (struct List*)malloc(sizeof(struct List));
strcpy(temp->word, p);
temp->m=1;
temp->next=Head;
Head=temp;
}
}
}
else
{
n=1;
}
}
return Head;
}
void Printf(List *Head)
{
struct List *q;
int i,a[10];
for(i=0;i<10;i++)
a[i]=0;
printf("文本单词出现频率由高到低依次为:\n");//排序输出
for(i=0;i<10;i++)
{
q=Head;
while(q!=NULL)
{
if(q->m>a[i])
a[i]=q->m;
else
q=q->next;
}
q=Head;
while(q!=NULL)
{
if(a[i]==q->m)
{
q->m=0;
printf("%s\t",q->word);
printf("出现频数为:%d\n",a[i]);
break;
}
else
q=q->next;
}
}
}
运行一个名叫test1.txt的文本以后,统计的词频结果如下。
接下来是对程序进行性能分析,我采用的是VS2012自带的性能分析工具。
经过性能分析以后,发现在字典过滤的模块还可以进行优化,我采用的是最传统的构建字典,然后for循环一个一个的compare,通过网上百度以后发现,其实还有更好的方法,那就是采用Hash表或者正则表达式等等来进行过滤,性能会更高,我会在后面有时间的时候加以尝试。
相关文章推荐
- 编程统计一个英文文本文件中单词词频
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- 统计一个英文文本的单词词频
- C语言实现英文文本词频统计
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- 统计英文文本中的词频
- 英文单词词频统计代码
- 软件体系结构课程设计:词频统计程序(包含英文单词和数字double,int)
- Hadoop的改进实验(中文分词词频统计及英文词频统计)(1/4)
- Python 对文本先按词频统计,若相同按字典排序,后取TopN
- Hadoop的改进实验(中文分词词频统计及英文词频统计)(2/4)
- 统计英文单词词频
- 一本英文小说的词频统计
- Hadoop的改进实验(中文分词词频统计及英文词频统计)(3/4)
- 编译器DIY之———统计英文文本中的单词数,字符数和行数
- Java 文件统计:编写程序,统计英文文本文件中的字符数目和单词数目。程序运行时,输入要统计的文件的名称,程序处理后输出字符数目和单词数目
- 1st 英文文章词频统计
- 使用Lucene词频统计与d3.cloud展示的中文英文词云系统
- Python 3.6 利用NLTK 统计多个文本中的词频
- Hadoop的改进实验(中文分词词频统计及英文词频统计)(4/4)