/*
 * 您的位置:首页 > 编程语言 > C语言/C++
 *
 * 常见的字符串匹配算法对比实现C语言版本
 *
 * 2015-08-25 17:15 711 查看
 */
/*文件命名:match.c

*/

#include <limits.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Number of distinct byte values; sizes the Boyer-Moore bad-character table. */
#define ASCII_SIZE 256

/* Status codes returned by bad_character_BM(). */
#define SUCCESS 0

/* Parenthesised so the negative value stays intact inside larger expressions. */
#define ERROR (-1)

/* Radix of the rolling polynomial hash used by Rabin_Karp(). */
#define RABIN_KARP_BASE 2

/* One occurrence of a byte inside the pattern; bad_character_BM() chains
 * these into one singly linked list per byte value. */
struct BM_bad_character_node {

int pos; /* 0-based index of this occurrence within the pattern */

struct BM_bad_character_node *next; /* next node of the same list, or NULL */

};

/*
 * Naive (brute-force) substring search.
 *
 * Looks for the first occurrence of P in T and prints either the 0-based
 * match position or a "not found" message, together with the number of loop
 * steps taken.  Nothing is printed when T or P is NULL.
 *
 * T: NUL-terminated text to search in.
 * P: NUL-terminated pattern; an empty pattern matches at position 0.
 */
void brute_force(char *T, char *P) {
    int i, j, total = 0;

    if (T == NULL || P == NULL)
        return;

    /* An empty pattern matches at the very start of any text.  Without this
     * guard the loop below reports position 1, or "not found" for empty T. */
    if (P[0] == '\0') {
        printf("Brute-Force 在位置 %d 找到 %s, 总比对次数=%d\n", 0, P, total);
        return;
    }

    for (i = j = 0; T[i] != '\0'; ) {
        total++;

        if (j < 0 || T[i] == P[j]) {
            i++;
            j++;

            if (P[j] == '\0') { /* whole pattern matched */
                printf("Brute-Force 在位置 %d 找到 %s, 总比对次数=%d\n", i - j, P, total);
                return;
            }
        } else {
            /* Mismatch: rewind i to the start of the current attempt and park
             * j at -1.  The next step (also counted in total) advances both,
             * so the following attempt starts one character further right. */
            i = i - j;
            j = -1;
        }
    }

    printf("Brute-Force 找不到 %s, 总比对次数=%d\n", P, total);
} // end of brute_force()

/*
 * Builds the KMP failure table for pattern P.
 *
 * On return next[0] is -1 and, for 1 <= k <= strlen(P), next[k] is the length
 * of the longest proper prefix of P[0..k-1] that is also a suffix of it.
 *
 * P:    NUL-terminated pattern.
 * next: output array with room for strlen(P) + 1 ints.
 *
 * Does nothing when P or next is NULL.
 */
void preprocess_KMP(char *P, int *next) {
    size_t len, i;

    if (P == NULL || next == NULL)
        return;

    len = strlen(P);
    next[0] = -1;

    for (i = 0; i < len; i++) {
        /* Try to extend the border of P[0..i-1] by the character P[i]... */
        int k = next[i] + 1;

        /* ...and fall back to successively shorter borders while the
         * character that would extend them does not match P[i]. */
        while (k > 0 && P[k - 1] != P[i])
            k = next[k - 1] + 1;

        next[i + 1] = k;
    }
} // end of preprocess_KMP()

/*
 * Knuth-Morris-Pratt substring search.
 *
 * Looks for the first occurrence of P in T and prints either the 0-based
 * match position or a "not found" message, together with the number of loop
 * steps taken.  Nothing is printed when any argument is NULL.
 *
 * T:    NUL-terminated text to search in.
 * P:    NUL-terminated pattern; an empty pattern matches at position 0.
 * next: failure table for P as produced by preprocess_KMP()
 *       (next[0] == -1, strlen(P) + 1 entries).
 */
void kmp(char *T, char *P, int *next) {
    int i, j, total = 0;

    if (T == NULL || P == NULL || next == NULL)
        return;

    /* An empty pattern matches at the very start of any text.  Without this
     * guard the loop below reports position 1, or "not found" for empty T. */
    if (P[0] == '\0') {
        printf("KMP 在位置 %d 找到 %s, 总比对次数=%d\n", 0, P, total);
        return;
    }

    for (i = j = 0; T[i] != '\0'; ) {
        total++;

        if (j < 0 || T[i] == P[j]) {
            /* j < 0 means even P[0] failed at T[i]: step past that character. */
            ++i;
            ++j;

            if (P[j] == '\0') { /* whole pattern matched */
                printf("KMP 在位置 %d 找到 %s, 总比对次数=%d\n", i - j, P, total);
                return;
            }
        } else {
            /* Keep i in place and resume from the longest border of P[0..j-1]. */
            j = next[j];
        }
    }

    printf("KMP 找不到 %s, 总比对次数=%d\n", P, total);
} // end of kmp

/*
 * Builds the Boyer-Moore bad-character table for pattern p.
 *
 * For every byte of p a node holding its 0-based position is pushed onto the
 * front of BM_head[byte].  Because the pattern is scanned left to right, each
 * list ends up ordered by decreasing position.
 *
 * p:       NUL-terminated pattern.
 * BM_head: table of ASCII_SIZE list heads.  Existing entries become the tails
 *          of the new lists, so the caller must initialise them (main() sets
 *          them all to NULL).
 *
 * Returns SUCCESS, or ERROR when an argument is NULL or an allocation fails.
 * On allocation failure the nodes linked so far stay in BM_head; the caller
 * owns every node and is responsible for freeing them.
 */
int bad_character_BM(char *p, struct BM_bad_character_node *BM_head[ASCII_SIZE]) {
    int pos;

    if (p == NULL || BM_head == NULL)
        return ERROR;

    for (pos = 0; p[pos] != '\0'; pos++) {
        /* Index by unsigned value so bytes >= 0x80 do not become negative. */
        unsigned char uc = (unsigned char)p[pos];
        struct BM_bad_character_node *node = malloc(sizeof *node);

        if (node == NULL)
            return ERROR;

        node->pos = pos;
        node->next = BM_head[uc];
        BM_head[uc] = node;
    }

    return SUCCESS;
} // end of bad_character_BM()

/*
 * Builds the Boyer-Moore good-suffix shift table for pattern p.
 *
 * good_suffix[i] is the distance the pattern may be moved right when a
 * mismatch happens at pattern index i after p[i+1..] has already matched:
 *   - if that matched suffix also occurs earlier in p, the distance back to
 *     its rightmost earlier occurrence;
 *   - otherwise strlen(p) minus the length of the longest proper tail of the
 *     suffix that is also a prefix of p (strlen(p) when there is none).
 * good_suffix[strlen(p) - 1] (nothing matched yet) is set to 1.
 *
 * p:           NUL-terminated pattern.
 * good_suffix: output array with room for strlen(p) ints.
 *
 * Does nothing when an argument is NULL or p is empty.
 */
void good_suffix_BM(char *p, int *good_suffix) {
    int nLen, i;

    if (p == NULL || good_suffix == NULL)
        return;

    nLen = (int)strlen(p);
    if (nLen == 0)
        return;

    /* With no matched suffix to reuse, only a shift of one is always safe. */
    good_suffix[nLen - 1] = 1;

    for (i = nLen - 2; i >= 0; i--) {
        const char *suffix = p + i + 1;
        int suffixLen = nLen - i - 1;
        const char *scan = p;
        const char *hit;
        const char *last = NULL;

        /* Rightmost occurrence of the suffix that starts before the suffix
         * itself.  strstr always finds the suffix at its own position, so the
         * loop stops there at the latest. */
        while ((hit = strstr(scan, suffix)) != NULL && hit < suffix) {
            last = hit;
            scan = hit + 1;
        }

        if (last != NULL) {
            good_suffix[i] = (int)(suffix - last);
        } else {
            /* No full re-occurrence: a prefix of p may still line up with the
             * tail of the matched text.  Shifting by the whole pattern length
             * here would jump over such an alignment and miss matches. */
            int border;

            for (border = suffixLen - 1; border > 0; border--) {
                if (memcmp(p, p + nLen - border, (size_t)border) == 0)
                    break;
            }
            good_suffix[i] = nLen - border;
        }
    }
} // end of good_suffix_BM()

/*
 * Boyer-Moore substring search.
 *
 * Compares the pattern right to left against the text and, on a mismatch,
 * shifts by the larger of the bad-character and good-suffix distances.
 * Prints either the 0-based position of the first match or a "not found"
 * message, together with the number of character comparisons.  Nothing is
 * printed when any argument is NULL.
 *
 * T:           NUL-terminated text to search in.
 * P:           NUL-terminated pattern; an empty pattern matches at position 0.
 * BM_head:     bad-character table for P as built by bad_character_BM(): one
 *              list of pattern positions per byte value, in decreasing order.
 * good_suffix: good-suffix shifts for P, indexed by the mismatching pattern
 *              position (strlen(P) entries).
 */
void bm(char *T, char *P, struct BM_bad_character_node *BM_head[ASCII_SIZE], int *good_suffix) {
    int i, j, nLen, nTextLen;
    int total = 0;
    int nBadCharMove, nGoodSuffixMove, nMove;
    struct BM_bad_character_node *ptr;

    if (T == NULL || P == NULL || BM_head == NULL || good_suffix == NULL)
        return;

    nLen = (int)strlen(P);
    nTextLen = (int)strlen(T);

    /* An empty pattern matches at the start of any text.  Without this guard
     * the loop below would start at index -1 and read before both buffers. */
    if (nLen == 0) {
        printf("BM 在位置 %d 找到 %s, 总比对次数=%d\n", 0, P, total);
        return;
    }

    /* i walks the text and j the pattern; both start at the pattern's last
     * character for the current alignment. */
    i = j = nLen - 1;

    while (i < nTextLen) {
        while (T[i] == P[j]) {
            total++;

            if (j == 0) { /* matched all the way to the first character */
                printf("BM 在位置 %d 找到 %s, 总比对次数=%d\n", i, P, total);
                return;
            }

            i--;
            j--;
        }
        total++; /* the comparison that failed */

        /* Bad-character rule: rightmost occurrence of T[i] left of index j.
         * The list is in decreasing position order, so the first node with
         * pos < j is the one we want. */
        ptr = BM_head[(unsigned char)T[i]];
        while (ptr != NULL && ptr->pos >= j)
            ptr = ptr->next;

        if (ptr == NULL) {
            /* T[i] does not occur left of j: move the pattern just past it.
             * Shifting by the full pattern length here would skip alignments
             * that start after T[i] and could contain a match. */
            nBadCharMove = j + 1;
        } else {
            nBadCharMove = j - ptr->pos;
        }

        /* Good-suffix rule. */
        nGoodSuffixMove = good_suffix[j];

        /* Take the larger of the two shifts. */
        nMove = (nGoodSuffixMove > nBadCharMove) ? nGoodSuffixMove : nBadCharMove;

        /* Move the alignment right by nMove and restart at the pattern's last
         * character; (nLen-1-j) undoes the leftward steps taken while matching. */
        i = i + nMove + (nLen - 1 - j);
        j = nLen - 1;
    }

    printf("BM 找不到 %s, 总比对次数=%d\n", P, total);
} // end of bm()

/*
 * Rabin-Karp substring search with a rolling polynomial hash.
 *
 * The hash uses radix RABIN_KARP_BASE in unsigned int arithmetic, so overflow
 * simply wraps.  Windows whose hash equals the pattern's are verified
 * character by character.  Prints either the 0-based position of the first
 * match or a "not found" message, together with the number of hash and
 * character comparisons.  Nothing is printed when T or P is NULL.
 *
 * T: NUL-terminated text to search in.
 * P: NUL-terminated pattern to search for.
 */
void Rabin_Karp(char *T, char *P) {
    unsigned int sumText = 0, sumPattern = 0, maxFactor = 0;
    int nLen, nTextLen, i, j, total = 0;

    if (T == NULL || P == NULL)
        return;

    nLen = (int)strlen(P);
    nTextLen = (int)strlen(T);

    /* A pattern longer than the text cannot match, and hashing the first
     * window below would read past the end of T. */
    if (nLen > nTextLen) {
        printf("Rabin-Karp 找不到 %s, 总比对次数=%d\n", P, total);
        return;
    }

    /* Hash the pattern and the first text window.  maxFactor ends up as
     * RABIN_KARP_BASE^(nLen-1), the weight of a window's leading character. */
    for (i = 0; i < nLen; i++) {
        sumPattern *= RABIN_KARP_BASE;
        sumPattern += P[i];

        sumText *= RABIN_KARP_BASE;
        sumText += T[i];

        if (maxFactor == 0)
            maxFactor = 1;
        else
            maxFactor *= RABIN_KARP_BASE;
    }

    /* Here i == nLen: i is always one past the end of the current window. */
    for (;;) {
        total++;

        if (sumText == sumPattern) { /* equal hashes: verify character by character */
            for (j = 0; j < nLen; j++) {
                total++;
                if (P[j] != T[i - nLen + j])
                    break;
            }

            if (j == nLen) { /* found */
                printf("Rabin-Karp 在位置 %d 找到 %s, 总比对次数=%d\n", i - j, P, total);
                return;
            }
        }

        /* Stop only after the last window has been compared as well; leaving
         * the loop before that check misses a match at the end of the text. */
        if (i >= nTextLen)
            break;

        /* Slide the window: drop the leading character, append T[i]. */
        sumText = sumText - T[i - nLen] * maxFactor;
        sumText = sumText * RABIN_KARP_BASE;
        sumText = sumText + T[i];
        i++;
    }

    printf("Rabin-Karp 找不到 %s, 总比对次数=%d\n", P, total);
} // end of Rabin_Karp()

int main(int argc, char *argv[]) {

int *next=NULL; // KMP 使用

char *p=NULL,*t=NULL;

int i,nLen=0;

FILE *fp=NULL;

clock_t begin=0,end=0;

int *good_suffix = NULL; // BM 使用

struct BM_bad_character_node *BM_head[ASCII_SIZE]; // BM 使用

struct BM_bad_character_node *ptr=NULL; // BM 使用

if (argc != 3) {

printf("请使用 string_matching 档案名称 要寻找的文字\n");

return;

}

memset(BM_head,0,sizeof(struct BM_bad_character_node *)*ASCII_SIZE); // BM 使用

nLen = strlen(argv[2]);

if ((next = (int *)malloc(sizeof(int)*(nLen+1))) == NULL)

goto errexit;

/////////////////////////

// 读入 Pattern 文字

/////////////////////////

if ((p = (char *)malloc(sizeof(char)*(nLen+1))) == NULL)

goto errexit;

strncpy(p,argv[2],nLen);

p[nLen] = '\0';

if ((fp = fopen(argv[1],"r")) == NULL)

goto errexit;

/////////////////////////

// 读入 Text 档案

/////////////////////////

fseek(fp,0,SEEK_END);

nLen = ftell(fp);

rewind(fp);

if ((t = (char *)malloc(sizeof(char)*(nLen+1))) == NULL)

goto errexit;

nLen = fread(t,1,nLen,fp);

t[nLen] = '\0';

/////////////////////////

// Brute-Force search

/////////////////////////

begin = clock();

brute_force(t,p);

end = clock();

printf("Time of Brute-Force is %f\n",(float)(end-begin)/CLOCKS_PER_SEC);

/////////////////////////

// KMP search

/////////////////////////

preprocess_KMP(p,next);

begin = clock();

kmp(t,p,next);

end = clock();

printf("Time of KMP is %f\n",(float)(end-begin)/CLOCKS_PER_SEC);

/////////////////////////

// BM search

/////////////////////////

nLen = strlen(p);

if ((good_suffix = (int *)malloc(sizeof(int)*nLen)) == NULL)

goto errexit;

memset(good_suffix,0,sizeof(int)*nLen);

if (bad_character_BM(p,BM_head) != SUCCESS)

goto errexit;

good_suffix_BM(p,good_suffix);

begin = clock();

bm(t,p,BM_head,good_suffix);

end = clock();

printf("Time of BM is %f\n",(float)(end-begin)/CLOCKS_PER_SEC);

/////////////////////////

// Rabin-Karp search

/////////////////////////

begin = clock();

Rabin_Karp(t,p);

end = clock();

printf("Time of Rabin-Karp is %f\n",(float)(end-begin)/CLOCKS_PER_SEC);

errexit:

if (fp) {

fclose(fp);

fp = NULL;

}

if (p) {

free(p);

p = NULL;

}

if (t) {

free(t);

t = NULL;

}

if (next) {

free(next);

next = NULL;

}

if (good_suffix) {

free(good_suffix);

good_suffix = NULL;

}

for (i=0;i<ASCII_SIZE;i++) {

ptr = BM_head[i];

while(ptr) {

BM_head[i] = BM_head[i]->next;

free(ptr);

ptr = BM_head[i];

}

ptr = NULL;

}

} // end of main()
/*
 * 内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
 * 标签:
 */