您的位置:首页 > 其它

一个程序搞定最小编辑距离,最大公共子串,最大连续公共子串

2018-01-17 17:14 309 查看
今天看了浏览器拼写检查的实现流程,发现其中最核心的就是计算字符串之间的最小编辑距离,同时又联想到之前用动态规划实现的公共子串等相关问题,所以今天在这里做一个总结:用一段代码同时求解两个字符串之间的最小编辑距离、最长公共子序列(不要求连续)以及最长连续公共子串。代码如下(Python):

def compute_distance(A, B, distance_dict, substr_len, sub_str, sub_continue_str, sub_continue_str_len):
    """Fill the dynamic-programming tables for every prefix pair of A and B.

    All five dictionaries are keyed by ``(i, j)``, meaning "the first ``i``
    items of A versus the first ``j`` items of B", and are updated in place:

    * ``distance_dict``: edit distance where an insertion or deletion costs 1
      and a substitution costs 2.
    * ``substr_len`` / ``sub_str``: length and items (as a list) of a longest
      common subsequence (items need not be adjacent).
    * ``sub_continue_str_len`` / ``sub_continue_str``: length and items of the
      run of matching items that ends exactly at position ``i`` of A and
      ``j`` of B (empty when the last items differ).  The longest contiguous
      common substring is the largest entry over all keys, not necessarily
      the entry for the full strings.

    Entries already present in ``distance_dict`` for smaller prefixes are
    trusted and not recomputed, so pass fresh empty dictionaries for each new
    pair of inputs.

    The function recurses one item at a time, so the call depth grows with
    ``len(A) + len(B)``; very long inputs can exceed Python's recursion limit.

    Returns:
        A tuple ``(distance, lcs_len, lcs, run, run_len)`` holding the five
        table entries for the key ``(len(A), len(B))``.
    """
    i = len(A)
    j = len(B)
    key = (i, j)

    if i == 0 or j == 0:
        # One side is empty: the distance is the length of the other side,
        # and nothing can be shared.
        distance_dict[key] = j if i == 0 else i
        substr_len[key] = 0
        sub_str[key] = []
        sub_continue_str_len[key] = 0
        sub_continue_str[key] = []
    else:
        # Memoisation: make sure the three neighbouring sub-problems are in
        # the tables, computing each at most once.  The order (i-1, j),
        # (i, j-1), (i-1, j-1) decides the insertion order of the tables.
        for di, dj in ((1, 0), (0, 1), (1, 1)):
            if (i - di, j - dj) not in distance_dict:
                # The callee stores its own results under its own key.
                compute_distance(A[:i - di], B[:j - dj], distance_dict, substr_len,
                                 sub_str, sub_continue_str, sub_continue_str_len)

        up = (i - 1, j)
        left = (i, j - 1)
        diag = (i - 1, j - 1)

        if A[-1] == B[-1]:
            # Matching last items: the diagonal step is free and extends both
            # the common subsequence and the current contiguous run.
            distance_dict[key] = min(distance_dict[up] + 1,
                                     distance_dict[left] + 1,
                                     distance_dict[diag])
            substr_len[key] = substr_len[diag] + 1
            sub_str[key] = sub_str[diag] + [A[-1]]
            sub_continue_str[key] = sub_continue_str[diag] + [A[-1]]
            sub_continue_str_len[key] = sub_continue_str_len[diag] + 1
        else:
            # Differing last items: a substitution costs 2, and the
            # contiguous run ending here is broken.
            distance_dict[key] = min(distance_dict[up] + 1,
                                     distance_dict[left] + 1,
                                     distance_dict[diag] + 2)
            substr_len[key] = max(substr_len[up], substr_len[left])
            # On a tie the (i, j-1) candidate is kept.
            sub_str[key] = sub_str[up] if substr_len[up] > substr_len[left] else sub_str[left]
            sub_continue_str_len[key] = 0
            sub_continue_str[key] = []

    return (distance_dict[key], substr_len[key], sub_str[key],
            sub_continue_str[key], sub_continue_str_len[key])

def main():
    """Run the demo on two sample strings and print the three results.

    Prints the edit distance, a longest common subsequence, and the longest
    contiguous common substring (each sequence is printed as a list of
    characters).
    """
    A = 'asdefats'  # other samples: 'INTENaTION', 'AGGCTATCACCTGACCTCCAGGCCGATGCCC'
    B = 'werasdfaswer'  # other samples: 'EXECUaTIONEE', 'TAGCTATCACGACCGCGGTCGATTTGCCCGAC'

    distance_dict = {}          # edit distance per prefix pair
    substr_len = {}             # length of the longest common subsequence
    substr = {}                 # the longest common subsequence itself
    sub_continue_str = {}       # contiguous common run ending at each prefix pair
    sub_continue_str_len = {}   # length of that contiguous run

    # Only the distance and the subsequence are taken from the returned
    # tuple; the contiguous result is looked up in the tables below.
    dis, _lcs_len, lcs, _con_str, _con_str_len = compute_distance(
        A, B, distance_dict, substr_len, substr, sub_continue_str, sub_continue_str_len)

    # --------- result ---------
    max_sub_str = lcs
    min_edit_distance = dis

    # The longest contiguous run may end anywhere, so scan every prefix pair;
    # on ties the first key in insertion order wins.
    key = max(sub_continue_str_len, key=sub_continue_str_len.get)
    max_continue_sub_str = sub_continue_str[key]

    print('the min edit distance is : ', min_edit_distance)
    print('the max sub squence is: ', max_sub_str)
    print('the max continue sub squence is: ', max_continue_sub_str)

if __name__=='__main__':
    main()

运行结果如下:
the min edit distance is :  8

the max sub squence is:  ['a', 's', 'd', 'f', 'a', 's']

the max continue sub squence is:  ['a', 's', 'd']
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐