utf8字符串截取
2010-04-07 12:23
127 查看
UTF-8 字符串是变长编码的字符串,日常处理中截取字符串时,如果处理不好就会出现乱码。针对这个问题,下面给出一个通用的 UTF-8 字符串截取示例,分别用 Python 和 PHP 实现;根据同样的原理,可以用任何语言来实现。
# python
# utf8 string length: each UTF-8 character counts as 1
def safestrlength_utf8(sourcestr):  # {{{
    """Return the number of characters in a UTF-8 string.

    Args:
        sourcestr: UTF-8 encoded byte string (``bytes``/``bytearray``, i.e. a
            plain ``str`` on Python 2), or an already-decoded text string.

    Returns:
        The character count. For byte input, a lead byte together with the
        continuation bytes it announces counts as one character; any other
        byte (ASCII or a stray continuation byte) counts as one character by
        itself. For decoded text this is ``len(sourcestr)``. Empty input
        yields 0.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is already indexed per character.
        return len(sourcestr)

    # Indexing a bytearray yields integers on both Python 2 and Python 3.
    data = bytearray(sourcestr)
    total = len(data)
    pos = 0
    count = 0
    while pos < total:
        lead = data[pos]
        # The high bits of the lead byte give the sequence length
        # (the legacy 5- and 6-byte forms are still recognised).
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            pos += 1
        count += 1
    # No trailing "- 1": the loop stops at the last byte, so nothing extra
    # was counted.
    return count
# utf8 substr: cut by characters, never inside a multi-byte sequence
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first ``cutlength`` characters of a UTF-8 string.

    Args:
        sourcestr: UTF-8 encoded byte string (``bytes``/``bytearray``, i.e. a
            plain ``str`` on Python 2), or an already-decoded text string.
        cutlength: Maximum number of characters to keep.

    Returns:
        A prefix of ``sourcestr`` of the same type as the input. Byte input
        is never cut inside a multi-byte sequence. The whole input is
        returned when it holds fewer than ``cutlength`` characters, and an
        empty value when ``cutlength`` is zero or negative.
    """
    if cutlength <= 0:
        return sourcestr[:0]
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is already indexed per character.
        return sourcestr[:cutlength]

    # Indexing a bytearray yields integers on both Python 2 and Python 3.
    data = bytearray(sourcestr)
    total = len(data)
    pos = 0
    taken = 0
    # Strict "<" on the byte position: reading at pos == total would hand an
    # empty slice to the lead-byte lookup.
    while taken < cutlength and pos < total:
        lead = data[pos]
        # The high bits of the lead byte give the sequence length
        # (the legacy 5- and 6-byte forms are still recognised).
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            pos += 1
        taken += 1
    # Slicing clamps pos to the end if a truncated sequence overshoots it.
    return sourcestr[:pos]
// php
// substr for a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the first $cutlength characters of a UTF-8 encoded string.
 *
 * The string is walked byte by byte; the lead byte of each character tells
 * how many bytes that character occupies, so a multi-byte character is
 * never split.
 *
 * @param string $sourcestr UTF-8 encoded input.
 * @param int    $cutlength Maximum number of characters to keep.
 * @return string The leading characters; '' when $cutlength <= 0 or the
 *                input is empty; the whole input when it is shorter.
 */
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    $str_length = strlen($sourcestr);
    $i = 0; // byte offset of the next character
    $n = 0; // characters accepted so far
    // Strict "<": never read the position one past the end of the string.
    while (($n < $cutlength) && ($i < $str_length))
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        // The high bits of the lead byte give the sequence length
        // (the legacy 5- and 6-byte forms are still recognised).
        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            $i += 1;
        }
        $n++;
    }
    // Everything before byte offset $i is the requested prefix; the cast
    // covers PHP versions where substr() returns false for an empty result.
    return (string) substr($sourcestr, 0, $i);
} // }}}
// get the length of a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the number of characters in a UTF-8 encoded string.
 *
 * A lead byte together with the continuation bytes it announces counts as
 * one character; any other byte (ASCII or a stray continuation byte) counts
 * as one character by itself.
 *
 * @param string $sourcestr UTF-8 encoded input.
 * @return int Character count; 0 for an empty string.
 */
public static function safestrlength_utf8($sourcestr) // {{{
{
    $str_length = strlen($sourcestr);
    $i = 0; // byte offset of the next character
    $n = 0; // characters counted so far
    // Strict "<" with no final decrement: the count stays right even when a
    // truncated multi-byte sequence makes $i jump past the end.
    while ($i < $str_length)
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        // The high bits of the lead byte give the sequence length
        // (the legacy 5- and 6-byte forms are still recognised).
        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            $i += 1;
        }
        $n++;
    }
    return $n;
} // }}}
# python
# utf8 string length: each UTF-8 character counts as 1
def safestrlength_utf8(sourcestr):  # {{{
    """Return the number of characters in a UTF-8 string.

    Args:
        sourcestr: UTF-8 encoded byte string (``bytes``/``bytearray``, i.e. a
            plain ``str`` on Python 2), or an already-decoded text string.

    Returns:
        The character count. For byte input, a lead byte together with the
        continuation bytes it announces counts as one character; any other
        byte (ASCII or a stray continuation byte) counts as one character by
        itself. For decoded text this is ``len(sourcestr)``. Empty input
        yields 0.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is already indexed per character.
        return len(sourcestr)

    # Indexing a bytearray yields integers on both Python 2 and Python 3.
    data = bytearray(sourcestr)
    total = len(data)
    pos = 0
    count = 0
    while pos < total:
        lead = data[pos]
        # The high bits of the lead byte give the sequence length
        # (the legacy 5- and 6-byte forms are still recognised).
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            pos += 1
        count += 1
    # No trailing "- 1": the loop stops at the last byte, so nothing extra
    # was counted.
    return count
# utf8 substr: cut by characters, never inside a multi-byte sequence
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first ``cutlength`` characters of a UTF-8 string.

    Args:
        sourcestr: UTF-8 encoded byte string (``bytes``/``bytearray``, i.e. a
            plain ``str`` on Python 2), or an already-decoded text string.
        cutlength: Maximum number of characters to keep.

    Returns:
        A prefix of ``sourcestr`` of the same type as the input. Byte input
        is never cut inside a multi-byte sequence. The whole input is
        returned when it holds fewer than ``cutlength`` characters, and an
        empty value when ``cutlength`` is zero or negative.
    """
    if cutlength <= 0:
        return sourcestr[:0]
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is already indexed per character.
        return sourcestr[:cutlength]

    # Indexing a bytearray yields integers on both Python 2 and Python 3.
    data = bytearray(sourcestr)
    total = len(data)
    pos = 0
    taken = 0
    # Strict "<" on the byte position: reading at pos == total would hand an
    # empty slice to the lead-byte lookup.
    while taken < cutlength and pos < total:
        lead = data[pos]
        # The high bits of the lead byte give the sequence length
        # (the legacy 5- and 6-byte forms are still recognised).
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            pos += 1
        taken += 1
    # Slicing clamps pos to the end if a truncated sequence overshoots it.
    return sourcestr[:pos]
// php
// substr for a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the first $cutlength characters of a UTF-8 encoded string.
 *
 * The string is walked byte by byte; the lead byte of each character tells
 * how many bytes that character occupies, so a multi-byte character is
 * never split.
 *
 * @param string $sourcestr UTF-8 encoded input.
 * @param int    $cutlength Maximum number of characters to keep.
 * @return string The leading characters; '' when $cutlength <= 0 or the
 *                input is empty; the whole input when it is shorter.
 */
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    $str_length = strlen($sourcestr);
    $i = 0; // byte offset of the next character
    $n = 0; // characters accepted so far
    // Strict "<": never read the position one past the end of the string.
    while (($n < $cutlength) && ($i < $str_length))
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        // The high bits of the lead byte give the sequence length
        // (the legacy 5- and 6-byte forms are still recognised).
        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            $i += 1;
        }
        $n++;
    }
    // Everything before byte offset $i is the requested prefix; the cast
    // covers PHP versions where substr() returns false for an empty result.
    return (string) substr($sourcestr, 0, $i);
} // }}}
// get the length of a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the number of characters in a UTF-8 encoded string.
 *
 * A lead byte together with the continuation bytes it announces counts as
 * one character; any other byte (ASCII or a stray continuation byte) counts
 * as one character by itself.
 *
 * @param string $sourcestr UTF-8 encoded input.
 * @return int Character count; 0 for an empty string.
 */
public static function safestrlength_utf8($sourcestr) // {{{
{
    $str_length = strlen($sourcestr);
    $i = 0; // byte offset of the next character
    $n = 0; // characters counted so far
    // Strict "<" with no final decrement: the count stays right even when a
    // truncated multi-byte sequence makes $i jump past the end.
    while ($i < $str_length)
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        // The high bits of the lead byte give the sequence length
        // (the legacy 5- and 6-byte forms are still recognised).
        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            $i += 1;
        }
        $n++;
    }
    return $n;
} // }}}
U-00000000 - U-0000007F: | 0xxxxxxx |
U-00000080 - U-000007FF: | 110xxxxx 10xxxxxx |
U-00000800 - U-0000FFFF: | 1110xxxx 10xxxxxx 10xxxxxx |
U-00010000 - U-001FFFFF: | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
U-00200000 - U-03FFFFFF: | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
U-04000000 - U-7FFFFFFF: | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
相关文章推荐
- php截取字符串可以避免乱码 utf8
- 截取带emoji表情的utf8字符串
- PHP截取字符串[GB2312-UTF8编码]
- php自定义中文字符串截取函数substr_for_gb2312及substr_for_utf8示例
- 自己实现php UTF8中文字符串截取
- PHP substr 截取字符串出现乱码问题解决方法[utf8与gb2312]
- PHP截取UTF8字符串 utf-8 可以能占一个字符 二个字符 或者三个字符
- lua中截取UTF8字符串的方法(无乱码)
- php截取字符串,兼容utf8、gb2312,gbk、big5
- 中文字符串截取函数 substr_for_gb2312 and substr_for_utf8
- 真正根据utf8编码的规律来进行截取字符串的函数(utf8版sub_str )
- php截取字符串之截取utf8或gbk编码的中英文字符串示例
- PHP截取字符串[GB2312-UTF8编码]
- PHP实现以UTF8格式截取指定字符串位数
- 关于PHP字符串截取显示相同长度的字符以及UTF8下的ASCII编码
- 真正根据utf8编码的规律来进行截取字符串的函数(utf8版sub_str )
- php截取中文字符串支持utf8和gbk
- PHP截取UTF8字符串
- php自定义中文字符串截取函数substr_for_gb2312及substr_for_utf8示例
- dedecms 的cn_substr_utf8字符串截取函数商榷