python 短语查询(中文版本+英文版本)
2015-11-25 20:20
645 查看
# -*- coding: utf-8 -*-
"""Positional inverted index with word and phrase search.

English text is tokenized by `word_split` (runs of alphanumerics,
lower-cased); Chinese text is tokenized by `wordSplitForChinese` (jieba).
Index layouts:

    single document : {word: [positions]}            (1-based word ordinals)
    multi-document  : {word: {doc_id: [positions]}}
"""
import sys  # kept from the original source; unused by the index code itself


def word_split(text):
    """Split *text* into (position, word) tuples.

    Position is the 1-based ordinal of the word within *text* (a word
    counter, not a character offset).  A word is a maximal run of
    alphanumeric characters, lower-cased for normalization.
    """
    word_list = []
    current = []   # characters of the word being accumulated
    position = 0
    for ch in text:
        if ch.isalnum():
            current.append(ch)
        elif current:  # separator reached with a pending word: flush it
            position += 1
            word_list.append((position, u''.join(current).lower()))
            current = []
    if current:        # flush a trailing word not followed by a separator
        position += 1
        word_list.append((position, u''.join(current).lower()))
    return word_list


def _build_index(text, tokenizer):
    """Build a single-document index {word: [positions]} using *tokenizer*."""
    inverted = {}
    for position, word in tokenizer(text):
        # setdefault creates the posting list on first sight of the word
        inverted.setdefault(word, []).append(position)
    return inverted


def inverted_index(text):
    """Create an inverted index {word: [positions]} of one English document."""
    return _build_index(text, word_split)


def inverted_index_add(inverted, doc_id, doc_index):
    """Merge *doc_index* ({word: [positions]}) for *doc_id* into the
    multi-document index *inverted* ({word: {doc_id: [positions]}}).

    Returns *inverted* for convenience.
    """
    for word, positions in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = positions
    return inverted


def search(inverted, query):
    """Return the set of doc ids containing ALL indexed words of *query*.

    Query words absent from the index are ignored; if no query word is
    indexed an empty set is returned (the original inconsistently
    returned a list in that one case).
    """
    words = [word for _, word in word_split(query) if word in inverted]
    if not words:
        return set()
    return set.intersection(*(set(inverted[word]) for word in words))


def _phrase_match(inverted, words):
    """Return a list of doc ids where *words* occur consecutively, in order.

    *inverted* is a multi-document index; *words* is the tokenized,
    already index-filtered query.  Fixes two defects of the original
    implementation: a single-word phrase now returns EVERY matching
    document (not an arbitrary first one), and an empty word list
    returns [] instead of raising IndexError.
    """
    if not words:
        return []
    # candidate documents contain every word of the phrase
    candidates = set.intersection(*(set(inverted[w]) for w in words))
    results = []
    for doc_id in candidates:
        postings = [set(inverted[w][doc_id]) for w in words]
        # the phrase starts at p iff word j sits at position p + j for all j
        if any(all(p + j in postings[j] for j in range(len(words)))
               for p in postings[0]):
            results.append(doc_id)
    return results


def searchPhrase(inverted, query):
    """Return a list of doc ids containing *query* as a contiguous phrase."""
    words = [word for _, word in word_split(query) if word in inverted]
    return _phrase_match(inverted, words)


def wordSplitForChinese(text):
    """Tokenize Chinese *text* with jieba into (1-based position, word).

    jieba is imported lazily so the English-only functions work without
    the package installed.  The original joined and re-split the tokens
    on '*****', which produced the same list; that round-trip is gone.
    """
    import jieba
    return [(i, word) for i, word in enumerate(jieba.cut(text), start=1)]


def inverted_index_chinese(text):
    """Create an inverted index {word: [positions]} of one Chinese document."""
    return _build_index(text, wordSplitForChinese)


def inverted_index_add_chinese(inverted, doc_id, doc_index):
    """Chinese-flavoured alias of inverted_index_add (identical layout)."""
    return inverted_index_add(inverted, doc_id, doc_index)


def search_chinese(inverted, query):
    """Return the set of doc ids containing ALL jieba tokens of *query*."""
    words = [word for _, word in wordSplitForChinese(query) if word in inverted]
    if not words:
        return set()
    return set.intersection(*(set(inverted[word]) for word in words))


def searchPhraseChinese(inverted, query):
    """Return a list of doc ids containing *query* as a contiguous phrase."""
    words = [word for _, word in wordSplitForChinese(query) if word in inverted]
    return _phrase_match(inverted, words)


# --- demo documents --------------------------------------------------------

doc1 = """ Niners head coach Mike Singletary will let Alex Smith remain his starting quarterback, but his vote of confidence is anything but a long-term mandate. 
Smith now will work on a week-to-week basis, because Singletary has voided his year-long lease on the job. "I think from this point on, you have to do what's best for the football team," Singletary said Monday, one day after threatening to bench Smith during a 27-24 loss to the visiting Eagles.Zero-rate buildings """

doc2 = """ The fifth edition of West Coast Green, a conference focusing on "green" home innovations and products, rolled into San Francisco's Fort Mason last week intent, per usual, on making our living spaces more environmentally friendly - one used-tire house at a time.Zero-rated buildings To that end, there were presentations on topics such as water efficiency and the burgeoning future of Net Zero-rated buildings that consume no energy and produce no carbon emissions.on a job,on the job """

doc3 = """ 土豆洋葱煎饼 小贴士:土豆一定不要擦丝 用刀切 稍微粗一些口感会像薯条 面粉不要太多 拌匀后一定是比较稀释的状态 这样吃得时候满嘴都是土豆和洋葱的香味 而不是吃一块面饼 面粉只是为了帮助土豆洋葱可以成型 切洋葱的时候可以先把洋葱切成大块放入水里泡一会儿 这样不辣眼睛 喜欢脆一点的可以重复炸一遍 """

doc4 = """ 鸡肉切好,青椒鸡切小块些好入味。淘洗干净。 青椒去蒂洗净切段 香姑洗净切成喜欢的大小(太小了炒熟后找不到哦,哈哈) 姜切成小段再拍散,蒜拍一点,留一点整的 锅里烧水,量至少要能没过鸡肉。水开后放一段拍好的姜,加一铲子料酒,这一步开水去浮沫。姜、料酒去腥 随后倒入鸡肉用馋子和散。让每块鸡肉都和水充分接触(大火) 肉下锅后,在锅里和个一分来钟就就会看到有油沫什么的浮起来,然后过滤用筛子沥起。(不能在锅里时间太长了,要不肉太老了,要塞牙哦~~) 锅内放油烧热,油温七八成热(能看到有少量油烟往外冒),然后倒入鸡肉大火翻炒,刚开始一放肉,会看到油变得很浑,那是里面的水分太多。鸡肉放进去小炒一会(一般一两分钟),油有变清一点即可捞出鸡肉。肉不要炸太久哦,老了很干,后面也不好入味 鸡肉捞出锅后,剩下油烧一会,待油水分差不多没了,洒入花椒(我忘了厨房没有,忘买了就没放)后立马放拍好的姜和蒜 姜、蒜在油锅里和两下倒进切好的青椒翻炒均匀 青椒下锅翻炒三四下,下鸡肉 鸡肉下锅翻炒三四下放入剥好的整颗的大蒜,同时加盐,盐少少的放,不够再加 然后不快不慢的翻炒,大概炒个三四分钟,能看到鸡骨与肉分离的样子,尝一下鸡肉,已经能吃出青椒味了 倒入香姑,把香姑往锅底和。不喜欢香姑的可以不加,省略此步 加入香姑后翻炒至香姑体积变小,明显脱水。加少量老抽拌均(酱油也行),一是调色,二是让肉更入味。 老抽/酱油加进锅翻炒一分钟左右,从锅周边加少量水,小火闷两分钟,有锅盖就加盖闷吧。。我找不到锅盖啦!! 
水差不多干就了就行啦,要是水加多了不好干,到后面就大火收一下就好啦。这就可以了,吃的时候,像吃火锅一样,小火加热着吃,关火了话吃一会会感觉比较油。吃的时候可以在锅底埋点白菜什么的小菜,也是很入味的哦~~ 大蒜炒熟了好好吃,没点蒜冲味 """


def main():
    """Demo: index doc1..doc4 and run the sample word and phrase queries."""
    inverted = {}
    documents = {'doc1': doc1, 'doc2': doc2}
    for doc_id, text in documents.items():
        inverted_index_add(inverted, doc_id, inverted_index(text))

    print("*****search common words*****")
    for query in ['Week', 'Niners', 'coast']:
        print("Search for '%s': %r" % (query, search(inverted, query)))

    print()
    print("*****search phrases*****")
    for query in ['Zero-rated buildings', 'on the job', 'West Coast Green']:
        print("Search for '%s': %r" % (query, searchPhrase(inverted, query)))

    print()
    inverted_chinese = {}
    documents_chinese = {'doc3': doc3, 'doc4': doc4}
    for doc_id, text in documents_chinese.items():
        inverted_index_add_chinese(inverted_chinese, doc_id,
                                   inverted_index_chinese(text))

    print("*****search common chinese words*****")
    for query in ['土豆', '青椒', '花椒']:
        print("Search for", query, ":", search_chinese(inverted_chinese, query))

    # the original re-printed the "common chinese words" banner here and
    # called search_chinese; these are phrase queries, so run phrase search
    print("*****search chinese phrases*****")
    for query in ['土豆洋葱煎饼', '翻炒一分钟', '土豆和洋葱的香味']:
        print("Search for", query, ":",
              searchPhraseChinese(inverted_chinese, query))


if __name__ == "__main__":
    main()
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法