您的位置:首页 > 其它

基于规则的命名实体识别

2011-04-06 19:55 363 查看
/*
* =====================================================================================
*
*       Filename:  regex_test.c
*
*    Description: 根据规则模板抽取命名实体
*
*        Version:  1.0
*        Created:  2011年04月06日 10时49分16秒
*       Revision:  none
*       Compiler:  gcc
*
*         Author:  齐保元 (qby), qibaoyuan@126.com
*        Company:  ict,gucas
*
* =====================================================================================
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
/* One extraction rule: a pattern name plus its associated probability. */
typedef struct _RULE_ {
    /* Name of the rule, e.g. a+n+b+ */
    char   name[256];
    /* Probability that this rule derives a named entity (supplied by Wang Shi) */
    double freq;
} RULE;
//Holds every compiled regular-expression object
static regex_t regexes[256];
//Input buffer, laid out as {{"蓝屏","司机"},{"a","n"}}: row 0 holds the words,
//row 1 holds the part-of-speech tag of each word; the two rows share indices
static char *input[2][500];
//Forward declarations
FILE *file_open(char* file_name,char* mode);
char* read_line (FILE *fp);
char* parse_line(char* line,int index,char sign);
void process_regex(int count,char*line);
static void close_all_regex(int count);
/*
 * Free the word and tag strings of the first n buffered tokens and clear
 * their slots, so a later (shorter) sentence never sees stale entries.
 */
static void
release_sentence(int n)
{
    for (int k = 0; k < n; k++) {
        free(input[0][k]);
        free(input[1][k]);
        input[0][k] = NULL;
        input[1][k] = NULL;
    }
}

/*
 * Concatenate the tags of the n buffered tokens into one string, run every
 * compiled rule over it, then release the buffered tokens.
 * Nothing is matched when the sentence has no tags.
 */
static void
flush_sentence(int count, int n)
{
    char tags[500] = {'\0'};
    size_t used = 0;

    for (int k = 0; k < n; k++) {
        const char *tag = input[1][k];
        if (tag == NULL || tag[0] == '\0')
            break;                      /* stop at the first missing/empty tag */
        size_t len = strlen(tag);
        if (used + len >= sizeof tags)
            break;                      /* tag string full: ignore the rest */
        memcpy(tags + used, tag, len);
        used += len;
        tags[used] = '\0';
    }

    if (used > 0)
        process_regex(count, tags);
    release_sentence(n);
}

/*
 * Read the corpus file "huge.crf_format.txt_0" and apply the compiled rules
 * to every sentence.
 *
 * Each non-empty line is split on TAB: field 0 is stored as the word and
 * field 1 as its part-of-speech tag in input[0] / input[1].  An empty line
 * ends the current sentence, which is then handed to process_regex().  A
 * final sentence that is not followed by an empty line is processed too.
 *
 * count is the number of compiled regexes; they are released with
 * close_all_regex(count) before returning, including when the corpus
 * cannot be opened.
 */
void
get_corpse(int count)
{
    FILE *fp = file_open("huge.crf_format.txt_0", "r");
    if (fp == NULL) {
        close_all_regex(count);
        return;
    }

    const int capacity = (int)(sizeof input[0] / sizeof input[0][0]);
    int n = 0;                          /* tokens buffered for this sentence */
    char *line;

    while ((line = read_line(fp)) != NULL) {
        if (line[0] == '\0') {          /* blank line: sentence boundary */
            flush_sentence(count, n);
            n = 0;
        } else if (n < capacity) {      /* tokens beyond capacity are dropped */
            input[0][n] = parse_line(line, 0, '\t');
            input[1][n] = parse_line(line, 1, '\t');
            n++;
        }
        free(line);
    }

    flush_sentence(count, n);           /* sentence without trailing blank line */
    fclose(fp);
    close_all_regex(count);
}
/*-----------------------------------------------------------------------------
*  对每一行规则进行解析,存入结构体对象
*-----------------------------------------------------------------------------*/
RULE
parse_rule(char* line){
if(line==NULL){
perror("空串");
exit(1);
}
RULE rule;
memset(rule.name,0,sizeof(rule.name));
char *p=line;
char *q=line;
while(*p==' ')p++;
while(*q==' ')q++;

while(*q!='/t')q++;//指向regex的结尾
int index=0;
while(p!=q){
if(*p=='+' || *p=='*')
sprintf(rule.name,"%s%s",rule.name,"//");
sprintf(rule.name,"%s%c",rule.name,p[index]);

p++;
}

return rule;
}
/*-----------------------------------------------------------------------------
 *  Compile every rule of the file "template" into the global regexes[] array
 *  and return how many were compiled.
 *
 *  Blank lines are skipped.  Loading stops at the first pattern that fails
 *  to compile (the error is reported on stderr) or when regexes[] is full;
 *  in both cases the number of patterns compiled so far is returned, so the
 *  caller can still use and release them.  Returns 0 when the template file
 *  cannot be opened.
 *-----------------------------------------------------------------------------*/
static int
compile_all_regx(void)
{
    const int capacity = (int)(sizeof regexes / sizeof regexes[0]);
    const int cflags = 0;
    char ebuf[300];
    int loaded = 0;

    FILE *fp = file_open("template", "r");
    if (fp == NULL)
        return 0;

    char *line;
    while ((line = read_line(fp)) != NULL) {
        if (line[0] == '\0') {
            free(line);
            continue;
        }
        if (loaded >= capacity) {
            fprintf(stderr, "too many rules; only the first %d are loaded\n",
                    capacity);
            free(line);
            break;
        }

        printf("第%d行\n", loaded);
        /*
         * Expected line layout (pattern, TAB, frequency):
         *   a	0.1
         *   b	0.2
         */
        RULE rule = parse_rule(line);
        free(line);                     /* rule holds its own copy of the text */
        printf("加载第%d个正则表达式:%s,freq:%f\n", loaded, rule.name, rule.freq);

        /* Compile straight into the table instead of copying a regex_t. */
        int z = regcomp(&regexes[loaded], rule.name, cflags);
        if (z != 0) {
            regerror(z, &regexes[loaded], ebuf, sizeof(ebuf));
            fprintf(stderr, "%s:pattern '%s'\n", ebuf, rule.name);
            break;
        }
        loaded++;
    }

    fclose(fp);
    return loaded;
}
/*-----------------------------------------------------------------------------
*  关闭所有打开的正则表达式
*-----------------------------------------------------------------------------*/
static void close_all_regex(int count){
int i=0;
for(i=0;i<count;i++){
regfree(®exes[i]);
}
}
/*-----------------------------------------------------------------------------
 *  Open file_name with the given fopen() mode and return the stream.
 *  On failure (including a NULL file_name or mode) a message is written to
 *  stderr and NULL is returned; the caller must check for NULL.
 *-----------------------------------------------------------------------------*/
FILE *
file_open(char* file_name,char* mode)
{
    if (file_name == NULL || mode == NULL) {
        fprintf(stderr, "无法打开文件:%s.\n", file_name ? file_name : "(null)");
        return NULL;
    }

    FILE *fp = fopen(file_name, mode);
    if (fp == NULL)
        fprintf(stderr, "无法打开文件:%s.\n", file_name);
    return fp;
}

/*-----------------------------------------------------------------------------
 *  Read one line from fp and return it as a newly allocated string without
 *  the trailing newline.  The caller owns the result and must free() it.
 *
 *  Returns NULL when fp is NULL, when the end-of-file indicator is already
 *  set on entry, on a read error with nothing read, or when memory cannot
 *  be allocated.  Because end-of-file is only detected by a read, the call
 *  that first hits end-of-file returns whatever was read so far - an empty
 *  string if the file ended right after a newline - and the next call
 *  returns NULL.
 *-----------------------------------------------------------------------------*/
char*
read_line (FILE *fp)
{
    if (fp == NULL || feof(fp))
        return NULL;

    size_t cap = 100;
    size_t len = 0;
    char *line = malloc(cap);
    if (line == NULL)
        return NULL;

    int ch;                             /* int, so EOF is distinguishable */
    while ((ch = fgetc(fp)) != EOF && ch != '\n') {
        if (len + 1 >= cap) {           /* keep one byte for the NUL */
            size_t new_cap = cap * 2;
            char *grown = realloc(line, new_cap);
            if (grown == NULL) {
                free(line);
                return NULL;
            }
            line = grown;
            cap = new_cap;
        }
        line[len++] = (char)ch;
    }

    /* A read error leaves feof() clear; report it instead of looping on "". */
    if (ch == EOF && len == 0 && ferror(fp)) {
        free(line);
        return NULL;
    }

    line[len] = '\0';
    return line;
}
/*-----------------------------------------------------------------------------
 *  Return a newly allocated copy of field number `index` (0-based) of `line`,
 *  where fields are separated by the character `sign`.
 *
 *  Leading spaces of the line are skipped.  A field ends at the next `sign`
 *  or at the end of the string, so the last field needs no trailing
 *  separator.  An index of 0 or less selects the first field; an index past
 *  the last field yields an empty string.  A NULL line also yields an empty
 *  string.  Returns NULL only when memory cannot be allocated.  The caller
 *  must free() the result.
 *-----------------------------------------------------------------------------*/
char*
parse_line(char* line,int index,char sign){
    const char *start = (line != NULL) ? line : "";
    while (*start == ' ')
        start++;

    const char *end = start;
    int field = 0;
    for (;;) {
        end = start;
        while (*end != '\0' && *end != sign)
            end++;
        if (field >= index)
            break;                      /* [start, end) is the wanted field */
        if (*end == '\0') {
            start = end;                /* ran out of fields: empty result */
            break;
        }
        start = end + 1;
        field++;
    }

    size_t len = (size_t)(end - start);
    char *ret = malloc(len + 1);
    if (ret == NULL)
        return NULL;
    memcpy(ret, start, len);
    ret[len] = '\0';
    return ret;
}
void
process_regex(int count,char *lbuf){
int p=0;
char *pattern;
int x,z,no=0,cflags=0;
char ebuf[1280]={'/0'};
char ret[1024];
char *curr;
regmatch_t pm[10];
const size_t nmatch=10;
char word[200];
int word_i;
int ori_i;
//对输入在每个正则表达式进行匹配,选取最长的
for(p=0;p<count;p++){
++no;
a	if((z=strlen(lbuf))>=0 && lbuf[z-1]=='/n')
lbuf[z-1]=0;
curr=lbuf;
ori_i=0;
while(regexec(®exes[p],curr,nmatch,pm,0)==0){
if(pm[0].rm_so==-1)
break;
memset(ret,0x00,sizeof(ret));
memset(word,0x00,sizeof(word));
memcpy(ret,curr+pm[0].rm_so,pm[0].rm_eo-pm[0].rm_so);
for(word_i=pm[0].rm_so+ori_i;word_i<pm[0].rm_eo+ori_i;word_i++){
sprintf(word,"%s%s",word,input[0][word_i]);
}

printf("pos:%s->word:%s/n/n",ret,word);
curr+=pm[0].rm_eo;
ori_i+=pm[0].rm_eo;
}
}
}
/*
 * Program entry point: compile the rule regexes, then run them over the
 * corpus.  Command-line arguments are not used.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* Pre-compile the regexes; the result is the number that were loaded. */
    const int loaded = compile_all_regx();

    get_corpse(loaded);

    return EXIT_SUCCESS;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: