字符串查找算法:bm算法
2015-07-02 14:20
302 查看
今天有空,认真地对比了一下经典的字符串查找算法 BM 算法和 C 库查找函数 strstr 的区别,两者各有优缺点,总结一下:
bm算法的应用场合:适合海量数据搜索,比如数据库,磁盘文件等,总之是数据量越大,性能越高;
strstr 的应用场合:适合数据量较少的情况,尤其是在一个几千字节的字符串中查找各种不同的模式串。这种情况下 bm 会被 strstr 甩出几条街,原因是 bm 每次查找都要先建立搜索模型(预处理跳转表),而 strstr 可以直接开始匹配。其次,当待查找的字符串在文本中出现次数很少、相同率很低时,strstr 的优势会再次显现,比如 HTTP 协议头部的关键字(Host、GET、POST、User-Agent、Accept 等),这些关键字在一次请求中重复出现的概率非常低,此时用 strstr 较快。
废话这么多,上代码证明一下吧,同时附加一个改进过的 bm 算法代码:
bm算法的应用场合:适合海量数据搜索,比如数据库,磁盘文件等,总之是数据量越大,性能越高;
strstr 的应用场合:适合数据量较少的情况,尤其是在一个几千字节的字符串中查找各种不同的模式串。这种情况下 bm 会被 strstr 甩出几条街,原因是 bm 每次查找都要先建立搜索模型(预处理跳转表),而 strstr 可以直接开始匹配。其次,当待查找的字符串在文本中出现次数很少、相同率很低时,strstr 的优势会再次显现,比如 HTTP 协议头部的关键字(Host、GET、POST、User-Agent、Accept 等),这些关键字在一次请求中重复出现的概率非常低,此时用 strstr 较快。
废话这么多,上代码证明一下吧,同时附加一个改进过的 bm 算法代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/*
 * Micro-benchmark comparing four substring searches on a small HTTP header:
 * Boyer-Moore (BM_strstr), the C library strstr, a line-start matcher
 * (line_strstr) and Sunday's quick search (sun_strstr).
 *
 * Define BM_BENCH_NO_MAIN before compiling to get only the search routines,
 * without the benchmark driver (useful when embedding them in another program).
 */

/* Size of the bad-character table: one slot for every possible byte value. */
#define MAX_CHAR 256
/* Longest pattern for which the good-suffix table is built. */
#define SIZE 128
/* Fully parenthesised so the macro is safe inside larger expressions. */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/*
 * Builds the Boyer-Moore bad-character table.
 *
 * bmBc must have MAX_CHAR entries. Every entry is set to m, then each byte
 * occurring in pattern[0 .. m-2] gets the distance from its last such
 * occurrence to the end of the pattern.
 */
void PreBmBc(const char *pattern, int m, int bmBc[])
{
    int i;

    for (i = 0; i < MAX_CHAR; i++) {
        bmBc[i] = m;
    }
    for (i = 0; i < m - 1; i++) {
        /* Index through unsigned char: plain char may be signed. */
        bmBc[(unsigned char)pattern[i]] = m - 1 - i;
    }
}

/*
 * Straightforward O(m^2) computation of the suffix table: suff[i] is the
 * length of the longest substring ending at i that is also a suffix of the
 * pattern. Kept as a reference implementation for suffix().
 */
void suffix_old(const char *pattern, int m, int suff[])
{
    int i, j;

    if (m <= 0) {
        return;
    }
    suff[m - 1] = m;
    for (i = m - 2; i >= 0; i--) {
        j = i;
        while (j >= 0 && pattern[j] == pattern[m - 1 - i + j]) {
            j--;
        }
        suff[i] = i - j;
    }
}

/*
 * Computes the same suffix table as suffix_old(), reusing earlier results:
 * f is the start index of the most recent explicit comparison and g is the
 * leftmost position that comparison reached.
 */
void suffix(const char *pattern, int m, int suff[])
{
    int f = 0; /* only read after the first iteration has assigned it */
    int g, i;

    if (m <= 0) {
        return;
    }
    suff[m - 1] = m;
    g = m - 1;
    for (i = m - 2; i >= 0; --i) {
        if (i > g && suff[i + m - 1 - f] < i - g) {
            suff[i] = suff[i + m - 1 - f];
        } else {
            if (i < g) {
                g = i;
            }
            f = i;
            while (g >= 0 && pattern[g] == pattern[g + m - 1 - f]) {
                --g;
            }
            suff[i] = f - g;
        }
    }
}

/*
 * Builds the Boyer-Moore good-suffix shift table; bmGs must have m entries.
 *
 * The suffix table lives in a SIZE-element stack buffer, so for m > SIZE
 * every shift is set to 1 instead. A shift of 1 can never skip a match; it
 * only gives up the speed-up.
 */
void PreBmGs(const char *pattern, int m, int bmGs[])
{
    int i, j;
    int suff[SIZE];

    if (m <= 0) {
        return;
    }
    if (m > SIZE) {
        for (i = 0; i < m; i++) {
            bmGs[i] = 1;
        }
        return;
    }

    suffix(pattern, m, suff);

    /* Case 3 (default): no reusable suffix, shift by the whole pattern. */
    for (i = 0; i < m; i++) {
        bmGs[i] = m;
    }

    /* Case 2: a prefix of the pattern equals a suffix of the matched part. */
    j = 0;
    for (i = m - 1; i >= 0; i--) {
        if (suff[i] == i + 1) {
            for (; j < m - 1 - i; j++) {
                if (bmGs[j] == m) {
                    bmGs[j] = m - 1 - i;
                }
            }
        }
    }

    /* Case 1: the matched suffix reoccurs elsewhere inside the pattern. */
    for (i = 0; i <= m - 2; i++) {
        bmGs[m - 1 - suff[i]] = m - 1 - i;
    }
}

/*
 * Boyer-Moore search for pattern[0 .. patternLen-1] in text[0 .. textLen-1].
 *
 * Returns a pointer to the first match, or NULL when there is none or the
 * arguments are invalid (NULL pointer, negative length). An empty pattern
 * matches at the start of the text. Patterns longer than SIZE are searched
 * with the bad-character rule only.
 */
const char* BM_strstr(const char *pattern, int patternLen, const char *text, int textLen)
{
    int i, j, bmBc[MAX_CHAR], bmGs[SIZE];
    int use_gs;

    if (pattern == NULL || text == NULL || patternLen < 0 || textLen < 0) {
        return NULL;
    }
    if (patternLen == 0) {
        return text;
    }
    if (patternLen > textLen) {
        return NULL;
    }

    /* Preprocessing */
    use_gs = (patternLen <= SIZE);
    PreBmBc(pattern, patternLen, bmBc);
    if (use_gs) {
        PreBmGs(pattern, patternLen, bmGs);
    }

    /* Searching */
    j = 0;
    while (j <= textLen - patternLen) {
        for (i = patternLen - 1; i >= 0 && pattern[i] == text[i + j]; i--)
            ;
        if (i < 0) {
            return text + j;
        } else {
            int bc = bmBc[(unsigned char)text[i + j]] - patternLen + 1 + i;
            int gs = use_gs ? bmGs[i] : 1;

            j += MAX(bc, gs);
        }
    }
    return NULL;
}

/*
 * Finds dst at the beginning of a line of src.
 *
 * After each '\n' or '\r', following newlines, spaces and tabs are skipped
 * and the text at that point is compared with dst. The first line of src is
 * never a candidate, because a match is only attempted after a line break.
 * Nothing at or beyond src + srcLen is read.
 */
const char* line_strstr(const char* src, int srcLen, const char* dst, int dstLen)
{
    const char* pline = src;
    const char* pEnd;

    if (src == NULL || dst == NULL || srcLen < 0 || dstLen < 0) {
        return NULL;
    }
    pEnd = src + srcLen;

    while (pline < pEnd) {
        if (*pline == '\n' || *pline == '\r') {
            ++pline;
            while (pline < pEnd &&
                   (*pline == '\n' || *pline == '\r' || *pline == ' ' || *pline == '\t')) {
                ++pline;
            }
            /* Compare only when the whole candidate fits inside the buffer. */
            if (pEnd - pline >= dstLen && memcmp(pline, dst, (size_t)dstLen) == 0) {
                return pline;
            }
        } else {
            ++pline;
        }
    }
    return NULL;
}

/*
 * Sunday (quick search) substring search for patt[0 .. pattLen-1] in
 * text[0 .. textLen-1].
 *
 * The shift is chosen from the byte just past the current window. Returns a
 * pointer to the first match, or NULL when there is none or the arguments
 * are invalid. An empty pattern matches at the start of the text.
 */
const char* sun_strstr(const char *text, int textLen, const char *patt, int pattLen)
{
    int shift[256];
    int i, pos, last;

    if (text == NULL || patt == NULL || textLen < 0 || pattLen < 0) {
        return NULL;
    }
    if (pattLen == 0) {
        return text;
    }
    if (pattLen > textLen) {
        return NULL;
    }

    /* Bytes absent from the pattern let the window jump completely past them. */
    for (i = 0; i < 256; i++) {
        shift[i] = pattLen + 1;
    }
    /* e.g. for a 5-byte pattern the first byte shifts by 5, the second by 4, ... */
    for (i = 0; i < pattLen; i++) {
        shift[(unsigned char)patt[i]] = pattLen - i;
    }

    last = textLen - pattLen; /* last valid window start */
    pos = 0;
    while (pos <= last) {
        if (memcmp(text + pos, patt, (size_t)pattLen) == 0) {
            return text + pos;
        }
        if (pos == last) {
            break; /* no byte follows the final window */
        }
        pos += shift[(unsigned char)text[pos + pattLen]];
    }
    return NULL;
}

/* Sample HTTP request header used as the benchmark haystack. */
const char *text =
    "GET /qq HTTP/1.1\n\r"
    "Connection: keep-alive\n\r"
    "Accept: image/webp,*/*;q=0.8\n\r"
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.89 Safari/537.36\n\r"
    "Referer: "
    "http://www.jd.com/?cu=true&utm_source=p.yiqifa.com&utm_medium=tuiguang&utm_campaign=t_1_792977&utm_term=2d74c473e39447508534846d9f847e2f\n\r"
    "Accept-Encoding: gzip, deflate, sdch \n\r"
    "Host: jcm.jd.com \n\r"
    "Accept-Language: zh-CN,zh;q=0.8\n\r";

#ifndef BM_BENCH_NO_MAIN

/* Number of searches timed per algorithm. */
#define TEST_CNT 1000000

/* Current wall-clock time in microseconds. */
static unsigned long long now_usec(void)
{
    struct timeval t;

    gettimeofday(&t, 0);
    return t.tv_sec * 1000000ULL + t.tv_usec;
}

/*
 * Times TEST_CNT searches for "Host" with each algorithm and prints the
 * elapsed microseconds. Exits with status 1 if any search fails.
 */
int main(void)
{
    char pattern[256] = {"Host"};
    int patLen = (int)strlen(pattern);
    int textLen = (int)strlen(text);
    unsigned long long a, b;
    int i;

    printf("patLen=%d, textLen=%d\n", patLen, textLen);

    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i) {
        const char* pstr = BM_strstr(pattern, patLen, text, textLen);
        if (!pstr) {
            printf("error\n");
            exit(1);
        }
    }
    b = now_usec();
    printf("BM_strstr: time=%llu\n", b - a);

    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i) {
        const char* pstr = strstr(text, pattern);
        if (!pstr) {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("strstr: time=%llu\n", b - a);

    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i) {
        const char* pstr = line_strstr(text, textLen, pattern, patLen);
        if (!pstr) {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("linestrstr: time=%llu\n", b - a);

    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i) {
        const char* pstr = sun_strstr(text, textLen, pattern, patLen);
        if (!pstr) {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("SUNDAY: time=%llu\n", b - a);

    return 0;
}

#endif /* BM_BENCH_NO_MAIN */
相关文章推荐
- UVa - 100 - The 3n + 1 problem
- Sample Code地址
- 斜坡光照阴影算法
- Linux下新手如何将VIM配置成C++编程环境(可以STL自动补全)
- [Leetcode]-Merge Two Sorted Lists
- Web开发之环境搭建
- Unity里面三角面数
- if条件综合shell一键安装LAMP
- 判断一个点是否在view上
- php 加密 解密 方法
- ContentObserver 介绍
- Cloudfoundry之Service
- JPush推送 之 RegistrationID 精确对点推送
- 【Android基础篇】TabWidget设置背景和字体
- 配置Odroid ubunutu 静态IP 并让PC 使用Ccproxy 做代理上网
- SecureCRT快捷键
- 实时操作系统与非实时操作系统到底有什么区别?
- 在ActionBar中进行Fragment之间的切换
- Android版本和API Level对应关系
- iOS开发各种小知识