您的位置:首页 > 其它

sunday 字符串匹配算法的实现(支持二进制匹配)

2013-05-05 21:50 441 查看
之前在解析 multipart/form-data 协议的 HTTP 请求包时, 需要用字符串匹配的方式寻找包体中的 boundary 标记. 这就涉及到了字符串匹配算法, 最后选择了 Sunday 算法.
据我所知, Sunday 是目前最快的单模式字符串匹配算法. 由于请求包体中可能含有二进制数据, 所以把 Sunday 算法改造成了支持二进制串匹配的版本.

sunday算法的原理不多说,网上一搜一大把, 下面贴下我的实现:

/*
* @desc    : Sunday String pattern matching algorithm (also support binary buf pattern match)
* @author  : nemozhang
*
*/
#ifndef __SUNDAY_H_20111203__
#define __SUNDAY_H_20111203__

#include <stdio.h>
#include <vector>

#ifndef u_char
#define u_char unsigned char
#endif

/**
 * Sunday single-pattern search over raw byte ranges.
 *
 * Text and pattern are half-open [start, end) ranges of bytes. Bytes are
 * compared as-is, so embedded zero bytes and values above 127 are handled
 * like any other byte.
 *
 * Usage: call SetPatten() once, then Search() any number of times.
 *
 * Lifetime: SetPatten() stores only the two pattern pointers; the pattern
 * bytes are not copied, so the caller must keep them valid for as long as
 * Search() is used.
 *
 * Lengths and offsets are held in int.
 */
class SundayAlgo
{
public:
    enum
    {
        JUMP_TABLE_LEN = 256    // one shift entry per possible byte value
    };

    enum
    {
        MATCH_RULE_STEP_ONE_CHAR   = 0,    // after a hit, resume one byte past the hit's start
        MATCH_RULE_STEP_ONE_PATTEN = 1     // after a hit, resume one pattern length past the hit's start
    };

public:

    SundayAlgo():
        _jump_table_inited(false),
        _pat_start(0),
        _pat_end(0),
        _match_rule(MATCH_RULE_STEP_ONE_CHAR)
    {}

public:
    /**
     * Finds the first occurrence of the current pattern in [text_start, text_end).
     *
     * @param text_start first byte of the text
     * @param text_end   one past the last byte of the text
     * @return offset of the first match relative to text_start, or -1 when the
     *         text is empty, no non-empty pattern has been set, or there is no match
     */
    int Search(const char *text_start, const char *text_end) const
    {
        if (text_start >= text_end)
        {
            return -1;
        }

        if (!_jump_table_inited)
        {
            return -1;
        }

        const int text_len   = static_cast<int>(text_end - text_start);
        const int pat_len    = static_cast<int>(_pat_end - _pat_start);
        const int last_start = text_len - pat_len;   // negative when the pattern is longer than the text

        int i = 0;
        while (i <= last_start)
        {
            // Compare the current window against the pattern, back to front.
            int k = pat_len - 1;
            while (k >= 0 && text_start[i + k] == _pat_start[k])
            {
                --k;
            }

            if (k < 0)
            {
                return i;   // every byte matched
            }

            // The shift is chosen by the byte just past the window. For the
            // last window that byte is text_end itself, which must not be
            // read; there is nothing left to try anyway.
            if (i == last_start)
            {
                return -1;
            }

            i += _jump_table[static_cast<unsigned char>(text_start[i + pat_len])];
        }

        return -1;
    }

    /**
     * Appends the offset (relative to text_start) of every match to pos_vec.
     *
     * How far the search resumes after each hit is governed by SetMatchRule().
     * pos_vec is not cleared first.
     */
    void Search(const char *text_start, const char *text_end, std::vector<int> &pos_vec) const
    {
        const int pat_len = static_cast<int>(_pat_end - _pat_start);
        const int step    = (MATCH_RULE_STEP_ONE_CHAR == _match_rule) ? 1 : pat_len;

        const char *cursor = text_start;
        for (;;)
        {
            const int pos = Search(cursor, text_end);
            if (pos == -1)
            {
                break;
            }

            pos_vec.push_back(static_cast<int>(cursor - text_start) + pos);

            // A hit implies a non-empty pattern, so step >= 1 and the cursor
            // always advances without moving beyond text_end.
            cursor += pos + step;
        }
    }

    /**
     * Sets the pattern to [pat_start, pat_end) (pat_end excluded) and rebuilds
     * the shift table. An empty range leaves the object without a usable
     * pattern: Search() then reports no match.
     */
    void SetPatten(const char* pat_start, const char* pat_end)
    {
        _pat_start = pat_start;
        _pat_end = pat_end;
        PreCompute(pat_start, pat_end);
    }

    /**
     * Chooses how the multi-hit Search() continues after a match.
     *
     * Example with text "aaaaaa" and pattern "aaa":
     *   MATCH_RULE_STEP_ONE_CHAR   -> 4 hits (offsets 0, 1, 2, 3)
     *   MATCH_RULE_STEP_ONE_PATTEN -> 2 hits (offsets 0, 3)
     *
     * Any value other than MATCH_RULE_STEP_ONE_CHAR is treated as
     * MATCH_RULE_STEP_ONE_PATTEN.
     */
    void SetMatchRule(int rule)
    {
        _match_rule = rule;
    }

private:
    // Builds the shift table for [pat_start, pat_end).
    // Entry b is the distance from the last occurrence of byte b in the
    // pattern to the pattern's end, or pat_len + 1 when b does not occur.
    void PreCompute(const char* pat_start, const char* pat_end)
    {
        if (pat_start >= pat_end)
        {
            // Drop any table built for a previous pattern so that an empty
            // pattern can never be reported as a zero-length "match".
            _jump_table_inited = false;
            return;
        }

        const int pat_len = static_cast<int>(pat_end - pat_start);

        for (int i = 0; i < JUMP_TABLE_LEN; ++i)
        {
            _jump_table[i] = pat_len + 1;
        }

        // Later occurrences overwrite earlier ones, leaving the smallest shift.
        for (const char *p = pat_start; p != pat_end; ++p)
        {
            _jump_table[static_cast<unsigned char>(*p)] = static_cast<int>(pat_end - p);
        }

        _jump_table_inited = true;
    }

private:
    // Shifts are stored as int: a byte-sized entry would truncate shifts of
    // 256 or more (patterns of 255+ bytes), and a shift truncated to 0 would
    // stall the search loop.
    int     _jump_table[JUMP_TABLE_LEN];
    bool    _jump_table_inited;

    const char    *_pat_start;
    const char    *_pat_end;

    int             _match_rule;
};

#endif


测试用例:
// by nemozhang

#include <gtest/gtest.h>
#include "sunday.h"
#include <unistd.h>

using namespace std;

// Searches an ASCII text for every occurrence of "sunday" using the default
// match rule, checks the reported offsets, and prints each hit followed by
// up to five bytes of trailing context.
TEST(autorun_SundayAlgo, test_ascii_str) {

    const char *text = "sunhello world !\n taday is sunday, i feel good now.\nthis is a text for sunday algo test program.day, sunhow,dslasun.sdslsunday" ;
    const char *pat = "sunday";

    const int text_len = static_cast<int>(strlen(text));
    const int pat_len = static_cast<int>(strlen(pat));

    SundayAlgo sunday;
    const char * pat_start = pat;
    const char * pat_end = pat_start + pat_len;
    sunday.SetPatten(pat_start, pat_end);

    vector<int> pos_vec;

    sunday.Search(text, text + text_len, pos_vec);

    // "sunday" occurs three times in the text above, at these offsets.
    ASSERT_EQ(3u, pos_vec.size());
    EXPECT_EQ(27, pos_vec[0]);
    EXPECT_EQ(71, pos_vec[1]);
    EXPECT_EQ(120, pos_vec[2]);

    // size_t values are cast so the arguments match the %d / %u conversions.
    printf("hit times : %d\n", static_cast<int>(pos_vec.size()));
    for (size_t i=0; i<pos_vec.size(); ++i)
    {
        printf("the %u time : %d\n", static_cast<unsigned int>(i), pos_vec[i]);

        // Print the hit plus up to five following bytes, clipped to the text.
        for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j)
        {
            if (j >= text_len)
            {
                break;
            }
            printf("%c",text[j]);
        }
        printf("\n");
    }

}

// Searches a binary buffer (containing zero bytes and values above 127) for
// the pattern {0,0,0} with MATCH_RULE_STEP_ONE_PATTEN, checks the reported
// offsets, and prints each hit followed by up to five bytes of context.
TEST(autorun_SundayAlgo, test_binary_str) {

    u_char text[] = {1,2,255,253,0,255,0,253,0,3,4,5,6,7,8,9,0,0,1,2,3,4,0,0,1,2,0,4,5,0,9,6,4,2,0,0,0,0,0,0,3,2,1,1,2,3,4,5,6,7,0,3,4,6,55,4,2,3,4,234,12,111,255,253,0,255,253,0,255,253,0};
    //u_char pat[] = {255,253,0};
    u_char pat[] = {0,0,0};

    const int text_len = static_cast<int>(sizeof(text));
    const int pat_len = static_cast<int>(sizeof(pat));

    SundayAlgo sunday;
    const char * pat_start = reinterpret_cast<const char*>(pat);
    const char * pat_end = pat_start + pat_len;
    sunday.SetPatten(pat_start, pat_end);

    vector<int> pos_vec;

    sunday.SetMatchRule(SundayAlgo::MATCH_RULE_STEP_ONE_PATTEN);
    sunday.Search(reinterpret_cast<const char*>(text), reinterpret_cast<const char*>(text) + text_len, pos_vec);

    // The only run of three or more zero bytes is text[34..39]; stepping one
    // pattern length per hit yields two non-overlapping matches.
    ASSERT_EQ(2u, pos_vec.size());
    EXPECT_EQ(34, pos_vec[0]);
    EXPECT_EQ(37, pos_vec[1]);

    printf("\n");
    printf("\n");
    // size_t values are cast so the arguments match the %d / %u conversions.
    printf("hit times : %d\n", static_cast<int>(pos_vec.size()));
    for (size_t i=0; i<pos_vec.size(); ++i)
    {
        printf("the %u time : %d\n", static_cast<unsigned int>(i), pos_vec[i]);

        // Print the hit plus up to five following bytes, clipped to the buffer.
        for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j)
        {
            if (j >= text_len)
            {
                break;
            }
            printf("%d,",text[j]);
        }
        printf("\n");
    }

}


输出如下:

nemo@vm04_sles10:[unittest]$ ./sunday_unittest 

Running main() from gtest_main.cc

[==========] Running 2 tests from 1 test case.

[----------] Global test environment set-up.

[----------] 2 tests from autorun_SundayAlgo

[ RUN      ] autorun_SundayAlgo.test_ascii_str

hit times : 3

the 0 time : 27

sunday, i f

the 1 time : 71

sunday algo

the 2 time : 120

sunday

[       OK ] autorun_SundayAlgo.test_ascii_str (0 ms)

[ RUN      ] autorun_SundayAlgo.test_binary_str

hit times : 2

the 0 time : 34

0,0,0,0,0,0,3,2,

the 1 time : 37

0,0,0,3,2,1,1,2,

[       OK ] autorun_SundayAlgo.test_binary_str (0 ms)

[----------] 2 tests from autorun_SundayAlgo (0 ms total)

[----------] Global test environment tear-down

[==========] 2 tests from 1 test case ran. (0 ms total)

[  PASSED  ] 2 tests.
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: