python justtest中一些小函数的整理,以备查用
2014-02-26 20:27
453 查看
#encoding:utf-8
# Assorted small helpers collected for later reference.  Every function is
# meant to be called by hand (e.g. from an interactive session); importing
# this module runs none of them.
#
# ``print`` is always called with a single argument so that the statements
# behave the same under the Python 2 ``print`` statement and the Python 3
# ``print()`` function.
import re

try:
    from tone2Pinyin import ConvertToneNumbersPinyin
except ImportError:
    # Only convert_to_pinyin_num()/write_file() need this project-local
    # module; the remaining helpers stay usable without it.
    ConvertToneNumbersPinyin = None

args = "Zhào Qián Sūn Lǐ Zhōu Wú Zhèng Wáng"

# Rows alternate: eight tone-marked pinyin syllables, then the eight
# characters they belong to.  Tokens must be whitespace separated, because
# convert_to_pinyin_num() pairs token ``i`` with token ``i + 8``.
s = u"""Zhào Qián Sūn Lǐ Zhōu Wú Zhèng Wáng
赵 钱 孙 李 周 吴 郑 王
Féng Chén Chǔ Wèi Jiǎng Shěn Hán Yáng
冯 陈 褚 卫 蒋 沈 韩 杨
Zhū Qin Yoú Xǔ Hé Lǚ Shi Zhāng
朱 秦 尤 许 何 吕 施 张"""


def convert_to_pinyin_num():
    """Map every character in ``s`` to its tone-number pinyin.

    Each pinyin token of ``s`` is passed (UTF-8 encoded) to
    ``ConvertToneNumbersPinyin`` and stored under the character found eight
    tokens later.  Every key is printed as it is processed.

    Returns:
        dict mapping a character to the converted pinyin plus a trailing
        newline.

    Raises:
        ImportError: if the ``tone2Pinyin`` module could not be imported.
    """
    if ConvertToneNumbersPinyin is None:
        raise ImportError("tone2Pinyin is required by convert_to_pinyin_num()")
    hanzi_pattern = re.compile(u"[\u4e00-\u9fa5]+")
    tokens = s.split()
    words_pinyin_dic = {}
    for index, token in enumerate(tokens):
        if hanzi_pattern.match(token):
            continue  # a character token; it is only ever used as a key
        # NOTE(review): the converter is handed UTF-8 bytes and its result is
        # concatenated with a native str, as in the original Python 2 code --
        # confirm against tone2Pinyin before running on Python 3.
        line_out = ConvertToneNumbersPinyin(token.encode("utf-8"))
        key_words = tokens[index + 8]
        print(key_words)
        words_pinyin_dic[key_words] = line_out + "\n"
    return words_pinyin_dic


def write_file():
    """Append the convert_to_pinyin_num() mapping to ``words_pinyin_num.txt``.

    One ``character<TAB>pinyin`` entry is written per key, UTF-8 encoded.
    Each entry is also printed.
    """
    import io
    words_pinyin_dic = convert_to_pinyin_num()
    # io.open in text mode keeps the platform newline handling of the plain
    # open() call this replaces while doing the UTF-8 encoding itself.
    with io.open("words_pinyin_num.txt", mode="a", encoding="utf-8") as f:
        for key, value in words_pinyin_dic.items():
            print(u"%s %s" % (key, value))
            com_str = u"\t".join((key, value))
            print(com_str)
            f.write(com_str)


def read_file(filename="words_pinyin_num.txt"):
    """Print the lines of a UTF-8 file with the digits 1-9 blanked out.

    Only lines containing at least one such digit are processed.

    Args:
        filename: file to read; defaults to the file written by write_file().

    Returns:
        list of the processed lines (each digit replaced by a space).
    """
    import codecs
    digit_pattern = re.compile("[1-9]")
    trimmed_lines = []
    with codecs.open(filename, encoding="utf-8") as f:
        for line in f.readlines():
            if digit_pattern.search(line):
                trim_num = digit_pattern.sub(" ", line)
                print(trim_num)
                trimmed_lines.append(trim_num)
    return trimmed_lines


def re_sub_fun(filename="words_pinyin_num.txt"):
    """Demonstrate ``re.sub``: blank out the first digit of every line.

    Args:
        filename: UTF-8 file to read.

    Returns:
        list of the substituted lines, which are also printed.
    """
    import codecs
    substituted = []
    with codecs.open(filename, encoding="utf-8") as f:
        for line in f.readlines():
            new_line = re.sub(r"\d", " ", line, count=1)
            print(new_line)
            substituted.append(new_line)
    return substituted


def convert_pinyin_lower(src="family_name_words_pin.txt",
                         dst="multi_family_name.txt"):
    """Demonstrate case conversion: copy ``src`` to ``dst`` in lower case.

    The text is decoded before lowering so that accented capitals are
    converted too, not only ASCII letters.

    Args:
        src: input file.  Assumed to be UTF-8, which is how the sibling
            helpers in this file read and write their data -- confirm for
            other inputs.
        dst: output file; overwritten.
    """
    import codecs
    # The input is opened first so that a missing source does not leave a
    # freshly truncated destination behind; both handles close on any error.
    with codecs.open(src, encoding="utf-8") as fin:
        with codecs.open(dst, mode="wb", encoding="utf-8") as fout:
            for line in fin.readlines():
                fout.write(line.lower())


def test_listdir_fun():
    """Demonstrate ``os.listdir``; ``glob`` and ``os.walk()`` are the other
    common ways to enumerate files."""
    import os
    print(os.listdir("."))


def str_find_fun():
    """Demonstrate ``str.find``.

    ``find`` returns -1 on failure, and -1 is truthy, so ``if s.find(x):``
    cannot be used to test whether the search succeeded; compare with -1.

    Returns:
        the index reported by ``find`` (-1 for the sample data).
    """
    text = "谁了"
    flag = text.find("天")
    print(flag)
    if flag != -1:
        print("yes")
    return flag


def check_mulitfamilyname_in_omit(
        omit_path="data/omit_check_words.txt",
        multi_path="data/multi_family_name.txt",
        high_freq_path="data/high_frequence_single_word.txt"):
    """Find surnames whose reading differs from the high-frequency reading.

    These are the characters that are likely to be given the wrong pinyin
    when they are used as a family name.

    Args:
        omit_path: UTF-8 file, one word per line.
        multi_path: UTF-8 file of ``word<TAB>surname pinyin`` lines.
        high_freq_path: UTF-16 file of ``word<TAB>high-frequency pinyin``
            lines.

    Returns:
        list of ``word<TAB>surname pinyin<TAB>high-frequency pinyin`` lines
        (newline terminated) for every word that is listed in all three
        files and whose two readings differ.  Each hit and the final count
        are printed.
    """
    import codecs
    with codecs.open(omit_path, encoding="utf-8") as f:
        omit_words_set = set(line.strip() for line in f.readlines())
    print(len(omit_words_set))

    multi_intersection_dic = {}
    with codecs.open(multi_path, encoding="utf-8") as f:
        for line in f.readlines():
            fields = line.strip().split("\t")
            # Blank or malformed lines have no pinyin column; skip them.
            if len(fields) >= 2 and fields[0] in omit_words_set:
                multi_intersection_dic[fields[0]] = fields[1]

    high_freq_pinyin_dic = {}
    with codecs.open(high_freq_path, encoding="utf-16") as f:
        for line in f.readlines():
            fields = line.strip().split("\t")
            if len(fields) >= 2:
                high_freq_pinyin_dic[fields[0]] = fields[1]

    temp_list = []
    for word, surname_pinyin in multi_intersection_dic.items():
        if word not in high_freq_pinyin_dic:
            # Without a high-frequency reading there is nothing to compare.
            continue
        high_freq_pinyin = high_freq_pinyin_dic[word]
        if high_freq_pinyin == surname_pinyin:
            continue
        temp_list.append(
            "\t".join((word, surname_pinyin, high_freq_pinyin)) + "\n")
        print(u"%s %s %s" % (word, surname_pinyin, high_freq_pinyin))
    print(len(temp_list))
    return temp_list


def gen_multi_pinyin(filename=None,
                     out_filename="multi_repeat_times_sort.txt"):
    """List the characters that have two or more readings, rarest first.

    The first tab-separated column of every input line is counted; words
    seen at least twice are appended to ``out_filename`` as
    ``word<TAB>count`` lines, ordered by ascending count.

    Args:
        filename: UTF-16 input file.  Defaults to ``HZout_NoTone.txt`` in
            the original author's working directory.
        out_filename: UTF-8 output file; opened in append mode.

    Returns:
        list of the lines that were written.  Their number is printed.
    """
    import codecs
    import os
    from collections import Counter
    if filename is None:
        path = r"E:\SVN\chocolate_ime\doc"
        filename = os.path.join(path, "HZout_NoTone.txt")
    # A single counting pass; calling list.count() once per distinct word is
    # quadratic and was very slow on the ~23k-line input.
    with codecs.open(filename, mode="rb", encoding="utf-16") as f:
        repeat_times = Counter(line.split("\t")[0] for line in f.readlines())
    # Sort the (word, count) pairs by count; sorted() returns a list.
    multi_pinyin_num_list = sorted(repeat_times.items(), key=lambda x: x[1])
    com_temp_list = [
        "\t".join((word, str(times))) + "\n"
        for word, times in multi_pinyin_num_list
        if times >= 2
    ]
    print(len(com_temp_list))
    with codecs.open(out_filename, mode="a", encoding="utf-8") as f:
        f.writelines(com_temp_list)
    return com_temp_list


def gen_multi_familyname_dic(filename="familyname_not_equal_high_freq.txt"):
    """Load the first two tab-separated columns of a UTF-8 file into a dict.

    Args:
        filename: file to read.

    Returns:
        dict mapping column one to column two.  Line endings are removed
        before splitting and lines with fewer than two columns are skipped.
        The dict is also printed.
    """
    import codecs
    multi_familyname_dic = {}
    with codecs.open(filename, encoding="utf8") as f:
        for line in f.readlines():
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue
            multi_familyname_dic[fields[0]] = fields[1]
    print(multi_familyname_dic)
    return multi_familyname_dic


def get_high_freq(arr=None):
    """Demonstrate ``max`` with a ``key``: pick the most frequent reading.

    The third element of every entry is the frequency.  It has to be
    converted with ``float``; compared as strings, "22" sorts before "4".

    Args:
        arr: list of ``[word, pinyin, frequency string]`` entries.  Defaults
            to a two-entry sample.

    Returns:
        the entry with the highest frequency, which is also printed.
    """
    if arr is None:
        arr = [[u'\u90fd', u'du', u'3093398\r\n'],
               [u'\u90fd', u'dou', u'18953009\r\n']]
    high_word_pinyin_freq_list = max(arr, key=lambda x: float(x[2]))
    print(high_word_pinyin_freq_list)
    return high_word_pinyin_freq_list
相关文章推荐
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- Xman整理篇___关于Python的一些基础方法及函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- 【python】整理一些实用的函数
- python 写函数的一些小细节
- 自己整理的关于C的一些字符串处理函数
- Python+Selenium WebDriver API:浏览器及元素的常用函数及变量整理总结
- python函数作用域、迭代器与生成器知识点整理
- Python一些函数的笔记
- python 中 常用到的 numpy 函数 整理
- python抓取京东商品颜色&beautifulsoup的一些常用函数
- Python的Django框架中URLconf相关的一些技巧整理
- 整理一些Python—matplotlib和numpy会常用到的网站