您的位置:首页 > 其它


2012-07-21 15:52 239 查看

这样看来,更好的办法就是使用哈希(位图)映射,能够直接定位到你的分隔符。微软就是这么实现的。strtok 函数源码中使用了 unsigned char map[32] 作为分隔符集合的映射。这个 map 一共有 32*8=256 个比特位,正好覆盖 unsigned char 的全部 256 个取值(ASCII 编码只占其中 128 个),所以直接拿字符值来做 hash 非常自然。源码如下:

#include <cruntime.h>
#include <string.h>
#ifdef _MT
#include <mtdll.h>
#endif    /* _MT */

/***
*char *strtok(string, control) - tokenize string with delimiter in control
*
*Purpose:
*       strtok considers the string to consist of a sequence of zero or more
*       text tokens separated by spans of one or more control chars. the first
*       call, with string specified, returns a pointer to the first char of the
*       first token, and will write a null char into string immediately
*       following the returned token. subsequent calls with zero for the first
*       argument (string) will work thru the string until no tokens remain. the
*       control string may be different from call to call. when no tokens remain
*       in string a NULL pointer is returned. remember the control chars with a
*       bit map, one bit per possible unsigned char value (32 * 8 = 256 bits).
*       the null char is always a control char.
*
*Entry:
*       char *string - string to tokenize, or NULL to get next token
*       char *control - string of characters to use as delimiters
*
*Exit:
*       returns pointer to first token in string, or if string
*       was NULL, to next token
*       returns NULL when no more tokens remain, or when string is NULL and
*       no position has been saved by an earlier call.
*
*******************************************************************************/
char * __cdecl strtok(char * string, const char * control) {
    unsigned char *str;
    /* Index the bitmap with unsigned bytes so values >= 0x80 are not negative. */
    const unsigned char *ctrl = (const unsigned char *) control;

    unsigned char map[32];
    int count;

#ifdef _MT
    _ptiddata ptd = _getptd();
#else    /* _MT */
    static char *nextoken;
#endif    /* _MT */

    /* Clear control map */
    for (count = 0; count < 32; count++)
        map[count] = 0;

    /* Set bits in delimiter table. The do/while form also records the
     * terminating null, so the null char always counts as a delimiter. */
    do {
        map[*ctrl >> 3] |= (unsigned char) (1 << (*ctrl & 7));
    } while (*ctrl++);

    /* Initialize str. If string is NULL, set str to the saved
     * pointer (i.e., continue breaking tokens out of the string
     * from the last strtok call) */
    if (string)
        str = (unsigned char *) string;
    else
#ifdef _MT
        str = (unsigned char *) ptd->_token;
#else    /* _MT */
        str = (unsigned char *) nextoken;
#endif    /* _MT */

    /* Continuation requested but nothing was ever saved: no token. */
    if (str == NULL)
        return NULL;

    /* Find beginning of token (skip over leading delimiters). Note that
     * there is no token iff this loop sets str to point to the terminal
     * null (*str == '\0') */
    while ((map[*str >> 3] & (1 << (*str & 7))) && *str)
        str++;

    string = (char *) str;

    /* Find the end of the token. If it is not the end of the string,
     * put a null there. */
    for (; *str; str++)
        if (map[*str >> 3] & (1 << (*str & 7))) {
            *str++ = '\0';
            break;
        }

    /* Update nextoken (or the corresponding field in the per-thread data
     * structure) */
#ifdef _MT
    ptd->_token = (char *) str;
#else    /* _MT */
    nextoken = (char *) str;
#endif    /* _MT */

    /* Determine if a token has been found. */
    if (string == (char *) str)
        return NULL;
    else
        return string;
}

/* Excerpt: these lines repeat the "set bits in delimiter table" loop of the
 * strtok listing above -- one bit per byte value, byte >> 3 selects the map
 * entry and byte & 7 selects the bit inside it. */
do {
map[*ctrl >> 3] |= (1 << (*ctrl & 7));
} while (*ctrl++);

/* Excerpt: the test used to skip leading delimiters.
 * NOTE(review): the loop body is not shown here -- presumably `str++;`. */
while ((map[*str >> 3] & (1 << (*str & 7))) && *str)


/* Excerpt: scan for the end of the token and terminate it in place.
 * NOTE(review): '/0' looks like a garbled '\0', and the closing braces of
 * this fragment are not shown. */
for (; *str; str++)
if (map[*str >> 3] & (1 << (*str & 7))) {
*str++ = '/0';




/* Set the bit for byte value `byte` in a 256-bit (32-byte) bitmap. */
static void cstrtok_set_bit(unsigned char *bits, unsigned char byte) {
    bits[byte >> 3] |= (unsigned char) (1 << (byte & 7));
}

/* Return non-zero iff the bit for byte value `byte` is set in the bitmap. */
static int cstrtok_test_bit(const unsigned char *bits, unsigned char byte) {
    return bits[byte >> 3] & (1 << (byte & 7));
}

/* Width in bytes of the character starting at p (p[0] must not be '\0'):
 * 2 for a high-bit lead byte followed by a non-null byte, otherwise 1. */
static int cstrtok_char_width(const unsigned char *p) {
    return ((p[0] & 0x80) && p[1] != '\0') ? 2 : 1;
}

/* Return non-zero iff the `width`-byte character at p is a delimiter. */
static int cstrtok_is_delim(const unsigned char *p, int width,
                            const unsigned char *map,
                            const unsigned char *vice_map) {
    if (width == 2)
        return cstrtok_test_bit(map, p[0]) && cstrtok_test_bit(vice_map, p[1]);
    /* A lone high-bit byte (truncated character) is never a delimiter. */
    return !(p[0] & 0x80) && cstrtok_test_bit(map, p[0]);
}

/**
 * @describe just like strtok of Microsoft, and the usage is the same,
 *           except that two-byte characters (a byte with the high bit set
 *           followed by one more byte, as in GBK) are handled as units, so
 *           Chinese words can be used as delimiters. Text and delimiters
 *           should be GBK encoded.
 *
 * Single-byte delimiters and the lead bytes of two-byte delimiters are
 * recorded in `map`; the trail bytes of two-byte delimiters are recorded in
 * `vice_map`. A two-byte character is treated as a delimiter when its lead
 * byte is in `map` and its trail byte is in `vice_map`.
 * Known limitation of this two-bitmap scheme: the lead byte of one delimiter
 * combined with the trail byte of another delimiter is also accepted.
 *
 * @param string   string to tokenize (modified in place), or NULL to
 *                 continue after the token returned by the previous call
 * @param control  delimiter characters; must not end in the middle of a
 *                 two-byte character
 * @return pointer to the next token, or NULL when no tokens remain, when
 *         `string` is NULL and no position has been saved, or when
 *         `control` ends with a truncated two-byte character (a diagnostic
 *         is then written to stderr)
 */
char* cstrtok(char* string, const char* control) {
    unsigned char *str;
    const unsigned char *ctrl;
    unsigned char map[32] = {0};
    unsigned char vice_map[32] = {0};
    int width;

#ifdef _MT
    _ptiddata ptd = _getptd();
#else    /* _MT */
    static char *nextoken;
#endif    /* _MT */

    /* Set bits in the delimiter tables. Refuse to go on if a GBK character
     * in the delimiter list has been truncated. */
    for (ctrl = (const unsigned char*) control; *ctrl; ctrl++) {
        cstrtok_set_bit(map, *ctrl);
        if (*ctrl & 0x80) {
            if (*(ctrl + 1) == '\0') {
                fprintf(stderr, "Some weird symbol occurred in delimiter\n"
                        "Maybe Chinese word GBK have been truncated.\n");
                return NULL;
            }
            cstrtok_set_bit(vice_map, *(ctrl + 1));
            ctrl++;    /* step over the trail byte as well */
        }
    }

    /* Initialize str. If string is NULL, set str to the saved
     * pointer (i.e., continue breaking tokens out of the string
     * from the last call) */
    if (string)
        str = (unsigned char*) string;
    else
#ifdef _MT
        str = (unsigned char*) ptd->_token;
#else    /* _MT */
        str = (unsigned char*) nextoken;
#endif    /* _MT */

    /* Continuation requested but nothing was ever saved: no token. */
    if (str == NULL)
        return NULL;

    /* Find beginning of token (skip over leading delimiters). Note that
     * there is no token iff this loop sets str to point to the terminal
     * null (*str == '\0') */
    while (*str) {
        width = cstrtok_char_width(str);
        if (!cstrtok_is_delim(str, width, map, vice_map))
            break;
        str += width;
    }

    string = (char*) str;

    /* Find the end of the token. If it is not the end of the string,
     * put a null there. Advancing by whole characters keeps the trail byte
     * of a non-delimiter character from being tested on its own. */
    while (*str) {
        width = cstrtok_char_width(str);
        if (cstrtok_is_delim(str, width, map, vice_map)) {
            str[0] = '\0';
            if (width == 2)
                str[1] = '\0';
            str += width;
            break;
        }
        str += width;
    }

    /* Update nextoken (or the corresponding field in the per-thread data
     * structure) */
#ifdef _MT
    ptd->_token = (char*) str;
#else    /* _MT */
    nextoken = (char*) str;
#endif    /* _MT */

    /* Determine if a token has been found. */
    if (string == (char*) str)
        return NULL;
    return string;
}

其中用到了一个 vice_map。因为中文 GBK 编码用两个字节表示一个字符,所以用到了两个 map:当第一个 map 中记录了一个最高位是 1 的字节(GBK 字符的首字节)之后,就把紧跟着的下一个字节记录到 vice_map 中。在查找的时候,如果遇到最高位是 1 的 char,它并不一定就是分隔符,因为还没有确定后面一个字节是不是和前一个拼接在一起作为分隔符。如果后面一个和前一个拼接起来不是分隔符,那么跳过这个字符。




内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息