python 短语查询(中文版本+英文版本)
2015-11-25 20:20
645 查看
# -*- coding: utf-8 -*-
"""Positional inverted index with word and phrase search.

English text is tokenized by `word_split` (runs of alphanumerics,
lower-cased); Chinese text is tokenized by `wordSplitForChinese` (jieba).
Index layouts:

    single document : {word: [positions]}            (1-based word ordinals)
    multi-document  : {word: {doc_id: [positions]}}
"""
import sys  # kept from the original source; unused by the index code itself


def word_split(text):
    """Split *text* into (position, word) tuples.

    Position is the 1-based ordinal of the word within *text* (a word
    counter, not a character offset).  A word is a maximal run of
    alphanumeric characters, lower-cased for normalization.
    """
    word_list = []
    current = []   # characters of the word being accumulated
    position = 0
    for ch in text:
        if ch.isalnum():
            current.append(ch)
        elif current:  # separator reached with a pending word: flush it
            position += 1
            word_list.append((position, u''.join(current).lower()))
            current = []
    if current:        # flush a trailing word not followed by a separator
        position += 1
        word_list.append((position, u''.join(current).lower()))
    return word_list


def _build_index(text, tokenizer):
    """Build a single-document index {word: [positions]} using *tokenizer*."""
    inverted = {}
    for position, word in tokenizer(text):
        # setdefault creates the posting list on first sight of the word
        inverted.setdefault(word, []).append(position)
    return inverted


def inverted_index(text):
    """Create an inverted index {word: [positions]} of one English document."""
    return _build_index(text, word_split)


def inverted_index_add(inverted, doc_id, doc_index):
    """Merge *doc_index* ({word: [positions]}) for *doc_id* into the
    multi-document index *inverted* ({word: {doc_id: [positions]}}).

    Returns *inverted* for convenience.
    """
    for word, positions in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = positions
    return inverted


def search(inverted, query):
    """Return the set of doc ids containing ALL indexed words of *query*.

    Query words absent from the index are ignored; if no query word is
    indexed an empty set is returned (the original inconsistently
    returned a list in that one case).
    """
    words = [word for _, word in word_split(query) if word in inverted]
    if not words:
        return set()
    return set.intersection(*(set(inverted[word]) for word in words))


def _phrase_match(inverted, words):
    """Return a list of doc ids where *words* occur consecutively, in order.

    *inverted* is a multi-document index; *words* is the tokenized,
    already index-filtered query.  Fixes two defects of the original
    implementation: a single-word phrase now returns EVERY matching
    document (not an arbitrary first one), and an empty word list
    returns [] instead of raising IndexError.
    """
    if not words:
        return []
    # candidate documents contain every word of the phrase
    candidates = set.intersection(*(set(inverted[w]) for w in words))
    results = []
    for doc_id in candidates:
        postings = [set(inverted[w][doc_id]) for w in words]
        # the phrase starts at p iff word j sits at position p + j for all j
        if any(all(p + j in postings[j] for j in range(len(words)))
               for p in postings[0]):
            results.append(doc_id)
    return results


def searchPhrase(inverted, query):
    """Return a list of doc ids containing *query* as a contiguous phrase."""
    words = [word for _, word in word_split(query) if word in inverted]
    return _phrase_match(inverted, words)


def wordSplitForChinese(text):
    """Tokenize Chinese *text* with jieba into (1-based position, word).

    jieba is imported lazily so the English-only functions work without
    the package installed.  The original joined and re-split the tokens
    on '*****', which produced the same list; that round-trip is gone.
    """
    import jieba
    return [(i, word) for i, word in enumerate(jieba.cut(text), start=1)]


def inverted_index_chinese(text):
    """Create an inverted index {word: [positions]} of one Chinese document."""
    return _build_index(text, wordSplitForChinese)


def inverted_index_add_chinese(inverted, doc_id, doc_index):
    """Chinese-flavoured alias of inverted_index_add (identical layout)."""
    return inverted_index_add(inverted, doc_id, doc_index)


def search_chinese(inverted, query):
    """Return the set of doc ids containing ALL jieba tokens of *query*."""
    words = [word for _, word in wordSplitForChinese(query) if word in inverted]
    if not words:
        return set()
    return set.intersection(*(set(inverted[word]) for word in words))


def searchPhraseChinese(inverted, query):
    """Return a list of doc ids containing *query* as a contiguous phrase."""
    words = [word for _, word in wordSplitForChinese(query) if word in inverted]
    return _phrase_match(inverted, words)


# --- demo documents --------------------------------------------------------

doc1 = """ Niners head coach Mike Singletary will let Alex Smith remain his starting quarterback, but his vote of confidence is anything but a long-term mandate. 
Smith now will work on a week-to-week basis, because Singletary has voided his year-long lease on the job. "I think from this point on, you have to do what's best for the football team," Singletary said Monday, one day after threatening to bench Smith during a 27-24 loss to the visiting Eagles.Zero-rate buildings """

doc2 = """ The fifth edition of West Coast Green, a conference focusing on "green" home innovations and products, rolled into San Francisco's Fort Mason last week intent, per usual, on making our living spaces more environmentally friendly - one used-tire house at a time.Zero-rated buildings To that end, there were presentations on topics such as water efficiency and the burgeoning future of Net Zero-rated buildings that consume no energy and produce no carbon emissions.on a job,on the job """

doc3 = """ 土豆洋葱煎饼 小贴士:土豆一定不要擦丝 用刀切 稍微粗一些口感会像薯条 面粉不要太多 拌匀后一定是比较稀释的状态 这样吃得时候满嘴都是土豆和洋葱的香味 而不是吃一块面饼 面粉只是为了帮助土豆洋葱可以成型 切洋葱的时候可以先把洋葱切成大块放入水里泡一会儿 这样不辣眼睛 喜欢脆一点的可以重复炸一遍 """

doc4 = """ 鸡肉切好,青椒鸡切小块些好入味。淘洗干净。 青椒去蒂洗净切段 香姑洗净切成喜欢的大小(太小了炒熟后找不到哦,哈哈) 姜切成小段再拍散,蒜拍一点,留一点整的 锅里烧水,量至少要能没过鸡肉。水开后放一段拍好的姜,加一铲子料酒,这一步开水去浮沫。姜、料酒去腥 随后倒入鸡肉用馋子和散。让每块鸡肉都和水充分接触(大火) 肉下锅后,在锅里和个一分来钟就就会看到有油沫什么的浮起来,然后过滤用筛子沥起。(不能在锅里时间太长了,要不肉太老了,要塞牙哦~~) 锅内放油烧热,油温七八成热(能看到有少量油烟往外冒),然后倒入鸡肉大火翻炒,刚开始一放肉,会看到油变得很浑,那是里面的水分太多。鸡肉放进去小炒一会(一般一两分钟),油有变清一点即可捞出鸡肉。肉不要炸太久哦,老了很干,后面也不好入味 鸡肉捞出锅后,剩下油烧一会,待油水分差不多没了,洒入花椒(我忘了厨房没有,忘买了就没放)后立马放拍好的姜和蒜 姜、蒜在油锅里和两下倒进切好的青椒翻炒均匀 青椒下锅翻炒三四下,下鸡肉 鸡肉下锅翻炒三四下放入剥好的整颗的大蒜,同时加盐,盐少少的放,不够再加 然后不快不慢的翻炒,大概炒个三四分钟,能看到鸡骨与肉分离的样子,尝一下鸡肉,已经能吃出青椒味了 倒入香姑,把香姑往锅底和。不喜欢香姑的可以不加,省略此步 加入香姑后翻炒至香姑体积变小,明显脱水。加少量老抽拌均(酱油也行),一是调色,二是让肉更入味。 老抽/酱油加进锅翻炒一分钟左右,从锅周边加少量水,小火闷两分钟,有锅盖就加盖闷吧。。我找不到锅盖啦!! 
水差不多干就了就行啦,要是水加多了不好干,到后面就大火收一下就好啦。这就可以了,吃的时候,像吃火锅一样,小火加热着吃,关火了话吃一会会感觉比较油。吃的时候可以在锅底埋点白菜什么的小菜,也是很入味的哦~~ 大蒜炒熟了好好吃,没点蒜冲味 """


def main():
    """Demo: index doc1..doc4 and run the sample word and phrase queries."""
    inverted = {}
    documents = {'doc1': doc1, 'doc2': doc2}
    for doc_id, text in documents.items():
        inverted_index_add(inverted, doc_id, inverted_index(text))

    print("*****search common words*****")
    for query in ['Week', 'Niners', 'coast']:
        print("Search for '%s': %r" % (query, search(inverted, query)))

    print()
    print("*****search phrases*****")
    for query in ['Zero-rated buildings', 'on the job', 'West Coast Green']:
        print("Search for '%s': %r" % (query, searchPhrase(inverted, query)))

    print()
    inverted_chinese = {}
    documents_chinese = {'doc3': doc3, 'doc4': doc4}
    for doc_id, text in documents_chinese.items():
        inverted_index_add_chinese(inverted_chinese, doc_id,
                                   inverted_index_chinese(text))

    print("*****search common chinese words*****")
    for query in ['土豆', '青椒', '花椒']:
        print("Search for", query, ":", search_chinese(inverted_chinese, query))

    # the original re-printed the "common chinese words" banner here and
    # called search_chinese; these are phrase queries, so run phrase search
    print("*****search chinese phrases*****")
    for query in ['土豆洋葱煎饼', '翻炒一分钟', '土豆和洋葱的香味']:
        print("Search for", query, ":",
              searchPhraseChinese(inverted_chinese, query))


if __name__ == "__main__":
    main()
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法