您的位置:首页 > 编程语言 > C语言/C++

[C语言]Unicode编码(二)-中文字符筛选

2016-11-01 00:05 302 查看
Unicode编码(二)-中文字符筛选
1,UTF-8编码中三字节中文字符的筛选方法如下(参数为解码后Unicode码点的高字节和低字节):
/*
 * Decide whether a 16-bit code point falls inside one of the CJK-related
 * ranges accepted by this filter.
 *
 * one - high byte of the code point
 * two - low byte of the code point
 *
 * Returns 1 when the code point ( one << 8 | two ) lies in a listed range,
 * -1 otherwise.
 */
int
chinese_filter(unsigned char one, unsigned char two )
{
    /* Inclusive [first, last] ranges; order does not matter. */
    static const struct
    {
        unsigned int first;
        unsigned int last;
    } ranges[] = {
        { 0x4E00, 0x9FCB }, /* CJK Unified Ideographs (incl. supplement up to 9FCB) */
        { 0x3400, 0x4DB5 }, /* CJK Unified Ideographs Extension A */
        { 0x2F00, 0x2FD5 }, /* Kangxi Radicals */
        { 0x2E80, 0x2EF3 }, /* CJK Radicals Supplement */
        { 0xF900, 0xFAD9 }, /* CJK Compatibility Ideographs */
        { 0xE815, 0xE86F }, /* Private Use Area sub-range */
        { 0xE400, 0xE5E8 }, /* Private Use Area sub-range */
        { 0xE600, 0xE6CF }, /* Private Use Area sub-range */
        { 0x31C0, 0x31E3 }, /* CJK Strokes */
        { 0x2FF0, 0x2FFB }, /* Ideographic Description Characters */
        { 0x3105, 0x3120 }, /* Bopomofo */
        { 0x31A0, 0x31BA }  /* Bopomofo Extended */
    };

    const unsigned int code = ( (unsigned int)one << 8 ) | two;
    unsigned int idx = 0;

    for ( ; idx < sizeof ranges / sizeof ranges[0]; idx += 1 )
    {
        if ( code >= ranges[idx].first && code <= ranges[idx].last )
        {
            return 1;
        }
    }

    return -1;
}
2,UTF-8编码中四字节中文字符筛选方法如下:
/*
 * Decide whether a supplementary-plane code point, given as its three
 * bytes from most to least significant, falls inside one of the ranges
 * accepted by this filter.
 *
 * one - bits 16 and above of the code point
 * two - bits 8..15 of the code point
 * thr - bits 0..7 of the code point
 *
 * Returns 1 when the code point lies in a listed range, -1 otherwise.
 */
int
chinese_filter2( unsigned char one, unsigned char two, unsigned char thr )
{
    const unsigned long cp = ( (unsigned long)one << 16 )
                           | ( (unsigned long)two << 8 )
                           | (unsigned long)thr;

    /* 20000-2A6D6 */
    if ( cp >= 0x20000UL && cp <= 0x2A6D6UL )
    {
        return 1;
    }

    /* 2A700-2B734 */
    if ( cp >= 0x2A700UL && cp <= 0x2B734UL )
    {
        return 1;
    }

    /* 2B740-2B81D */
    if ( cp >= 0x2B740UL && cp <= 0x2B81DUL )
    {
        return 1;
    }

    /* 2F800-2FA1D */
    if ( cp >= 0x2F800UL && cp <= 0x2FA1DUL )
    {
        return 1;
    }

    return -1;
}


3,UTF-8字符转Unicode编码:
1)src为输入的UTF-8字符串
2)unicode为UTF-8字符串转换后输出的Unicode编码串(高位字节在前)
3)chs为从字符串中筛选出来的中文字符(保持原有的UTF-8编码)
/*
 * Convert a NUL-terminated UTF-8 string into a stream of code point bytes
 * (most significant byte first) and collect the Chinese characters in it.
 *
 * src     - input UTF-8 string.
 * unicode - output buffer. Each sequence contributes a fixed number of
 *           bytes: 1 for a 1-byte sequence, 2 for 2- and 3-byte sequences,
 *           3 for 4-byte sequences and 4 for 5- and 6-byte sequences.
 *           A '\0' is stored after the last byte written.
 * chs     - output buffer receiving the original UTF-8 bytes of every
 *           3-byte character accepted by chinese_filter() and every 4-byte
 *           character accepted by chinese_filter2(). A '\0' is stored
 *           after the last byte written.
 *
 * Neither output can grow beyond the input, so both buffers must hold at
 * least strlen( src ) + 1 bytes.
 *
 * Returns the number of bytes stored in unicode (terminator excluded), or
 * -1 when src contains a stray continuation byte or a sequence whose
 * continuation bytes are missing or malformed. On error both buffers are
 * terminated after what was converted so far.
 */
int
utf_to_unicode( unsigned char *src, unsigned char *unicode, unsigned char *chs )
{
    int size = 0;
    int ch_len = 0;
    int failed = 0;

    while ( *src )
    {
        unsigned char one = *src;
        unsigned char two = 0x00;
        unsigned char thr = 0x00;
        unsigned char fou = 0x00;
        unsigned char fiv = 0x00;
        unsigned char six = 0x00;
        int seq_len = 0;
        int idx = 0;

        /* Sequence length from the lead byte; 0 marks an invalid lead. */
        if ( one < 0x80 )
        {
            seq_len = 1;
        }
        else if ( one < 0xC0 )
        {
            seq_len = 0; /* 10xxxxxx cannot start a sequence */
        }
        else if ( one < 0xE0 )
        {
            seq_len = 2;
        }
        else if ( one < 0xF0 )
        {
            seq_len = 3;
        }
        else if ( one < 0xF8 )
        {
            seq_len = 4;
        }
        else if ( one < 0xFC )
        {
            seq_len = 5;
        }
        else
        {
            seq_len = 6;
        }

        /*
         * Every trailing byte must look like 10xxxxxx. The scan stops at
         * the first mismatch, so it never reads beyond the terminating
         * '\0' of a truncated sequence.
         */
        for ( idx = 1; idx < seq_len; idx += 1 )
        {
            if ( ( *( src + idx ) & 0xC0 ) != 0x80 )
            {
                seq_len = 0;
                break;
            }
        }

        if ( seq_len == 0 )
        {
            printf( "Error: unknoe scope\n" );
            failed = 1;
            break;
        }

        if ( seq_len == 1 )
        {
            *( unicode + size++ ) = one;
        }
        else if ( seq_len == 2 )
        {
            /* 110xxxxx 10yyyyyy -> 00000xxx xxyyyyyy */
            two = *( src + 1 );

            *( unicode + size++ ) = ( one >> 2 ) & 0x07;
            *( unicode + size++ ) = ( two & 0x3F ) | ( ( one & 0x03 ) << 6 );
        }
        else if ( seq_len == 3 )
        {
            /* 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz */
            two = *( src + 1 );
            thr = *( src + 2 );

            *( unicode + size++ ) = ( ( two & 0x3C ) >> 2 ) | ( ( one & 0x0F ) << 4 );
            *( unicode + size++ ) = ( thr & 0x3F ) | ( ( two & 0x03 ) << 6 );

            if ( chinese_filter( *( unicode + size - 2 ), *( unicode + size - 1 ) ) == 1 )
            {
                *( chs + ch_len++ ) = one;
                *( chs + ch_len++ ) = two;
                *( chs + ch_len++ ) = thr;
            }
        }
        else if ( seq_len == 4 )
        {
            two = *( src + 1 );
            thr = *( src + 2 );
            fou = *( src + 3 );

            *( unicode + size++ ) = ( ( two & 0x30 ) >> 4 ) | ( ( one & 0x07 ) << 2 );
            *( unicode + size++ ) = ( ( thr & 0x3C ) >> 2 ) | ( ( two & 0x0F ) << 4 );
            *( unicode + size++ ) = ( fou & 0x3F ) | ( ( thr & 0x03 ) << 6 );

            if ( chinese_filter2( *( unicode + size - 3 ), *( unicode + size - 2 ),
                                  *( unicode + size - 1 ) ) == 1 )
            {
                *( chs + ch_len++ ) = one;
                *( chs + ch_len++ ) = two;
                *( chs + ch_len++ ) = thr;
                *( chs + ch_len++ ) = fou;
            }
        }
        else if ( seq_len == 5 )
        {
            two = *( src + 1 );
            thr = *( src + 2 );
            fou = *( src + 3 );
            fiv = *( src + 4 );

            *( unicode + size++ ) = one & 0x03;
            *( unicode + size++ ) = ( ( thr & 0x30 ) >> 4 ) | ( ( two & 0x3F ) << 2 );
            *( unicode + size++ ) = ( ( fou & 0x3C ) >> 2 ) | ( ( thr & 0x0F ) << 4 );
            *( unicode + size++ ) = ( fiv & 0x3F ) | ( ( fou & 0x03 ) << 6 );
        }
        else
        {
            two = *( src + 1 );
            thr = *( src + 2 );
            fou = *( src + 3 );
            fiv = *( src + 4 );
            six = *( src + 5 );

            *( unicode + size++ ) = ( two & 0x3F ) | ( ( one & 0x01 ) << 6 );
            *( unicode + size++ ) = ( ( fou & 0x30 ) >> 4 ) | ( ( thr & 0x3F ) << 2 );
            *( unicode + size++ ) = ( ( fiv & 0x3C ) >> 2 ) | ( ( fou & 0x0F ) << 4 );
            *( unicode + size++ ) = ( six & 0x3F ) | ( ( fiv & 0x03 ) << 6 );
        }

        src += seq_len;
    }

    *( unicode + size ) = '\0';
    *( chs + ch_len ) = '\0';

    return failed ? -1 : size;
}
4,主函数测试程序和Unicode编码打印程序

/*
 * Print the first `size` bytes of `unicode` to stdout as two-digit
 * uppercase hexadecimal values with no separators, followed by a newline.
 * A size of zero or less prints only the newline.
 */
void
unicode_print( unsigned char *unicode, int size )
{
    const unsigned char *cursor = unicode;
    int remaining = size;

    while ( remaining > 0 )
    {
        printf( "%02X", *cursor );
        cursor += 1;
        remaining -= 1;
    }

    printf("\n");
}
/*
 * Demo driver: converts a sample UTF-8 string with utf_to_unicode(),
 * prints the resulting code point bytes in hex and then the Chinese
 * characters that were filtered out of it.
 *
 * Returns 0 on success, 1 when the conversion reports an error.
 */
int
main( int argc, char *argv[] )
{
    unsigned char ch3[] = "一A严严·";

    /*
     * sizeof ch3 counts the terminating '\0', i.e. strlen + 1, which is
     * enough room for either output plus its terminator.
     */
    unsigned char unicode[sizeof ch3] = { 0 };
    unsigned char china[sizeof ch3] = { 0 };

    int size = 0;

    (void)argc;
    (void)argv;

    size = utf_to_unicode( ch3, unicode, china );
    if ( size < 0 )
    {
        return 1;
    }

    unicode_print( unicode, size );

    printf( "Chinese = %s\n", (const char *)china );

    return 0;
}


本文参考文献:http://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php?zfj=kzb
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  C语言