您的位置:首页 > 编程语言 > Python开发

【python】字符串编码问题

2014-01-16 16:59 239 查看
参考:/article/1759217.html

Python 2 内部用 unicode 对象表示文本字符串;普通的 str 对象保存的是已按某种编码(如 gbk、utf-8)编码后的字节串

decode函数用来将其他编码解码为unicode

encode函数将unicode编码为指定的编码类型,例如gbk,utf-8

# -*- coding: utf-8 -*-
"""
Created on Wed Jan 15 15:20:59 2014

@author: hp
"""

import contextlib
import re
import time
import urllib2
from collections import Counter

import jieba

# Address of the page that the script at the end of this file passes to geturl().
url="http://blog.sina.com.cn/s/blog_608e1afd0102e5ym.html"
def geturl(url):
html=urllib2.urlopen(url).read()
html=unicode(html,'utf-8')
word=re.findall(ur"[\u4e00-\u9fa5]+",html)

s=""
for w in word:
s+=w
return s  #return web content
def separate_word(s):
    """Segment text into words using jieba's accurate (non-full) mode.

    Args:
        s: Text to segment.

    Returns:
        A list of the segments yielded by ``jieba.cut(s, cut_all=False)``,
        in their original order.
    """
    # Use the segments exactly as jieba yields them. Round-tripping through
    # a "/ "-joined string and re-splitting it character by character would
    # drop the final word (no separator follows it), leave a leading space
    # on every word after the first, and needs no decode() step because the
    # segments are already text.
    return list(jieba.cut(s, cut_all=False))

def count_word(word_list):
    """Count how often each word occurs.

    Args:
        word_list: Sequence of words; each one is encoded to GBK, so they
            are expected to be unicode strings.

    Returns:
        A dict mapping every distinct word, encoded as GBK bytes, to the
        number of times it appears in ``word_list``. An empty input gives
        an empty dict.

    Raises:
        UnicodeEncodeError: If a word contains a character GBK cannot encode.
    """
    # Counter tallies all words in a single pass, replacing the nested
    # scans that compared every word against every other word.
    return {word.encode('gbk'): count
            for word, count in Counter(word_list).items()}

contant=geturl(url)
word=separate_word(contant)
result=count_word(word)
for key in result.keys():
print key.encode('gbk'),"--->",result[key]
#print result

time.sleep(10)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: