您的位置:首页 > 编程语言 > Python开发

CRF(条件随机场)的Python实现代码

2017-05-08 15:15 330 查看
作者:金良(golden1314521@gmail.com) csdn博客: http://blog.csdn.net/u012176591


对数域操作函数

class Logspace:
    """Arithmetic on non-negative numbers stored as natural logarithms.

    log(0) is represented by ``self.LOGZERO`` (NaN). Every method therefore
    treats NaN as "the value zero" instead of as an invalid number.
    """

    def __init__(self):
        # Sentinel standing in for log(0); detected with np.isnan().
        self.LOGZERO = np.nan

    def eexp(self, x):
        """Return exp(x); LOGZERO (NaN) maps back to 0."""
        if np.isnan(x):
            return 0
        else:
            return np.exp(x)

    def eln(self, x):
        """Return log(x); 0 maps to LOGZERO.

        Any other input that is not positive prints a warning and yields NaN,
        which is indistinguishable from LOGZERO for the caller.
        """
        if x == 0:
            return self.LOGZERO
        elif x > 0:
            return np.log(x)
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('Wrong!!!\n\t negative input error')
            return np.nan

    def elnsum(self, elnx, elny):
        """Return log(x + y) given elnx = log(x) and elny = log(y)."""
        if np.isnan(elnx):
            return elny
        elif np.isnan(elny):
            return elnx
        elif elnx > elny:
            # Factor out the larger term so the exponent is never positive.
            return elnx + self.eln(1 + np.exp(elny - elnx))
        else:
            return elny + self.eln(1 + np.exp(elnx - elny))

    def elnproduct(self, elnx, elny):
        """Return log(x * y); LOGZERO if either factor is LOGZERO."""
        if np.isnan(elnx) or np.isnan(elny):
            return self.LOGZERO
        else:
            return elnx + elny

    def elnmatprod(self, elnx, elny):
        """Log-domain matrix product of two 1-D or 2-D arrays.

        Computes, entry by entry, the log of the ordinary product of the
        exponentiated operands: vector.vector gives a scalar, vector.matrix
        and matrix.vector give a 1-D array, matrix.matrix gives a 2-D array.
        """
        # Note: np.array([[0.]]) has 2 dimensions, so it takes the matrix path.
        xdim = np.ndim(elnx)
        ydim = np.ndim(elny)

        if xdim == 1 and ydim == 1:
            # Inner product: log-sum of the pairwise log-products.
            r = self.LOGZERO
            for i in range(np.shape(elnx)[0]):
                r = self.elnsum(r, self.elnproduct(elnx[i], elny[i]))
            return r
        elif xdim == 1 and not ydim == 1:
            # Row vector times matrix: one inner product per column.
            n = np.shape(elny)[1]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx, elny[:, i])
            return r
        elif not xdim == 1 and ydim == 1:
            # Matrix times column vector: one inner product per row.
            n = np.shape(elnx)[0]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx[i, :], elny)
            return r
        else:
            m, n = np.shape(elnx)
            p = np.shape(elny)[1]
            r = np.zeros((m, p))
            for i in range(m):
                for j in range(p):
                    r[i][j] = self.elnmatprod(elnx[i, :], elny[:, j])
            return r

    def eexpmat(self, elny):
        """Apply eexp to every element and return a new float array.

        The copy is always float, so integer input is no longer truncated
        back to integers after exponentiation.
        """
        expy = np.array(elny, dtype=float)
        for idx in np.ndindex(expy.shape):
            expy[idx] = self.eexp(expy[idx])
        return expy

    def elnmat(self, x):
        """Apply eln to every element and return a new float array.

        The copy is always float, so integer input can hold LOGZERO (NaN)
        and fractional logarithms.
        """
        elnx = np.array(x, dtype=float)
        for idx in np.ndindex(elnx.shape):
            elnx[idx] = self.eln(elnx[idx])
        return elnx

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
1146b

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

测试举例:用普通矩阵乘法验证对数域矩阵乘法 elnmatprod 的结果
# Demo / sanity check: the log-domain matrix product, mapped back with
# eexpmat, must equal the ordinary product of the exponentiated operands.
# `logspace` stays a module-level name because getpostfeatureE uses it.
logspace = Logspace()
M1 = np.array([1, 0.5])
M2 = np.array([[1.3, 1.5], [1.8, 0.5]])
M3 = np.array([[0.8, 1.5], [1.8, 0.7]])
M4 = np.array([0, 0])

# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(logspace.eexpmat(logspace.elnmatprod(M1, M2)))
print(np.dot(logspace.eexpmat(M1), logspace.eexpmat(M2)))

1
2
3
4
5
6
7
8
1
2
3
4
5
6
7
8

[ 19.94836491 14.90077579] 

[ 19.94836491 14.90077579]


条件随机场的函数

def read_corps(corpsfile='testchunk.data'):
    """Read a chunking corpus file and split it into tagged sentences.

    Every non-blank line holds space-separated columns: column 0 is the word
    and column 1 is the tag recorded here. A line equal to ``. . O`` closes
    the current sentence; that closing period is not stored as a word.
    Lines after the last ``. . O`` are discarded.

    Words are lower-cased. Any word that ``float()`` accepts after removing
    commas is replaced by ``'#CD#'`` so numeric variants share one token.

    Data sources noted by the original author:
      http://www.chokkan.org/software/crfsuite/tutorial.html (large data set)
      http://blog.dpdearing.com/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/

    Args:
        corpsfile: path of the corpus file.

    Returns:
        (corps, tagids). ``corps`` is a list of ``(words, tags)`` tuples, one
        per non-empty sentence; both lists have equal length and are bracketed
        by the ``"<S>"`` word / tag id 0. ``tagids`` is the defaultdict that
        maps each tag string to an integer id in order of first appearance.

    Raises:
        IndexError: if a sentence line has fewer than two columns.
    """
    # Looking up an unseen tag assigns it the next free integer id.
    tagids = defaultdict(lambda: len(tagids))
    tagids["<S>"] = 0

    corps = []
    onesentence = []
    with open(corpsfile, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Blank or whitespace-only line carries no token.
                continue
            if entry != '. . O':
                # Sentence still open: buffer the line for later processing.
                onesentence.append(entry)
                continue

            # End of sentence (also recognised without a trailing newline):
            # convert the buffered lines into parallel word / tag lists.
            words = ["<S>"]
            tags = [0]
            for texts in onesentence:
                w_t = texts.strip().split(" ")
                try:
                    float(w_t[0].strip().replace(',', ''))
                except ValueError:
                    words.append(w_t[0].lower())
                else:
                    # Numeric token: collapse all numbers into one symbol.
                    words.append('#CD#')
                tags.append(tagids[w_t[1]])
            words.append("<S>")
            tags.append(0)
            if len(words) > 2:  # skip sentences that contain no words
                corps.append((words, tags))
            onesentence = []
    # Author's note: the original data set contained 40377 words in total.
    return corps, tagids
def getfeatureTS(corps):
    """Collect the distinct transition and state features of a corpus.

    A state feature is ``('S', word, tag)``; a transition feature is
    ``('T', previous_tag, tag)``, e.g. ``('T', 2, 3)`` means moving from
    state 2 to state 3. Positions whose word is ``'<S>'`` are skipped, and no
    transition is recorded when the preceding word is ``'<S>'``.

    Args:
        corps: iterable of ``(words, tags)`` pairs.

    Returns:
        (featureTS, words2tagids): ``featureTS`` lists all transition features
        followed by all state features, each in order of first appearance;
        ``words2tagids`` is the result of ``words2tagidfromfeatureS`` applied
        to the state features.
    """
    seen = set()            # every feature recorded so far
    transition_feats = []
    state_feats = []
    for corp in corps:
        words, tags = corp[0], corp[1]
        for pos in range(len(words)):
            if words[pos] == '<S>':
                continue

            state_key = ('S', words[pos], tags[pos])
            if state_key not in seen:
                seen.add(state_key)
                state_feats.append(state_key)

            if words[pos - 1] == '<S>':
                # First real word of the sentence: no transition feature.
                continue
            trans_key = ('T', tags[pos - 1], tags[pos])
            if trans_key not in seen:
                seen.add(trans_key)
                transition_feats.append(trans_key)

    words2tagids = words2tagidfromfeatureS(state_feats)
    return transition_feats + state_feats, words2tagids
def getpriorfeatureE(corps, featureTS):
    """Return the empirical (prior) expectation of every feature.

    For each position whose word is not ``'<S>'`` the state feature
    ``('S', word, tag)`` and the transition feature
    ``('T', previous_tag, tag)`` are looked up in ``featureTS``; features
    that are found get their count incremented, features that are absent are
    ignored. Counts are finally divided by the number of sentences.

    Args:
        corps: sequence of ``(words, tags)`` pairs.
        featureTS: list of hashable feature tuples.

    Returns:
        1-D float array aligned with ``featureTS``.
    """
    # len() instead of np.shape(): sentences have different lengths, and
    # NumPy refuses to build an array from such a ragged nested list.
    sentence_count = len(corps)
    priorfeatureE = np.zeros(len(featureTS))

    # Feature -> first position in featureTS (what list.index would return),
    # built once so each lookup is O(1) instead of a linear scan.
    feature_index = {}
    for position, feature in enumerate(featureTS):
        feature_index.setdefault(feature, position)

    for corp in corps:
        words, tags = corp[0], corp[1]
        for i in range(len(words)):
            if words[i] == '<S>':
                continue
            candidates = (
                ('S', words[i], tags[i]),
                ('T', tags[i - 1], tags[i]),
            )
            for key in candidates:
                idex = feature_index.get(key)
                if idex is not None:  # unknown features are skipped
                    priorfeatureE[idex] += 1.0
    priorfeatureE /= sentence_count
    # Author's observation on the original data: for both transition features
    # (starting at index 0) and state features (starting at index 318),
    # features recorded earlier tend to have larger prior expectations.
    return priorfeatureE
def words2tagidfromfeatureS(featureS):
    """Group the tag ids of state features by word.

    Args:
        featureS: iterable of state features whose element 1 is the word and
            element 2 is the tag id, e.g. ``('S', 'confidence', 1)``.

    Returns:
        dict mapping each word to the list of tag ids it occurs with, in the
        order the features are given.
    """
    tags_by_word = {}
    for entry in featureS:
        word, state = entry[1], entry[2]
        tags_by_word.setdefault(word, []).append(state)
    # Author's note: on the original data the number of words having
    # 1, 2, 3 and 4 tags was [3760, 389, 32, 1].
    return tags_by_word
def getpostfeatureE(weights,corps,featureTS,words2tagids):
K = np.shape(featureTS)[0] #特征数
postfeatureE = np.zeros(K) #特征的后验期望值
N = np.shape(corps)[0]
for corpidx in range(N):
corp = corps[corpidx][0][1:-1]

lencorp = np.size(corp) #语料长度,即句子中的单词数
Mlist = {}
Mlist['mat'] = ['']*(lencorp+1)
Mlist['dim'] = [words2tagids[corp[i]] for i in range(lencorp)]
Mlist['len'] = [np.size(words2tagids[corp[i]]) for i in range(lencorp)]
for i in range(lencorp+1):
if i == 0:#第一个矩阵,只有状态特征的行向量
d = Mlist['len'][0]
Mlist['mat'][i] = np.zeros((1,d))
for j in range(d):
Mlist['mat'][i][0,j] = weights[featureTS.index(('S', corp[0], words2tagids[corp[0]][j]))]
continue
if i == lencorp:#最后一个矩阵,元素为0的列向量矩阵
Mlist['mat'][i] = np.zeros((Mlist['len'][-1],1))
continue
#既非第一个矩阵,亦非第二个矩阵,每个元素要计算状态特征和转移特征
Mlist['mat'][i] = np.zeros((Mlist['len'][i-1],Mlist['len'][i]))
for d1 in range(Mlist['len'][i-1]):
for d2 in range(Mlist['len'][i]):
id1 = words2tagids[corp[i-1]][d1]
id2 = words2tagids[corp[i]][d2]
try:
Sweight = weights[featureTS.index(('S', corp[i], id2))]
except:
Sweight = 0
try:
Tweight = weights[featureTS.index(('T', id1, id2))]
except:
Tweight = 0
Mlist['mat'][i][d1,d2] = Sweight + Tweight

#return  Mlist,corps[0]
#return 0

z = np.array([[0]])
for i in range(lencorp+1):
z = logspace.elnmatprod(z,Mlist['mat'][i])

Alphalist = ['']*(lencorp+2)
Betalist = ['']*(lencorp+2)
Alphalist[0] = np.zeros((1,1))  # 第一个前向向量:1*1的矩阵
Betalist[-1] = np.zeros((Mlist['len'][-1],1))
#Alphalist里的元素是单行矩阵,Betalist里的元素是单列矩阵
for i in range(
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: