
Keras implementation of aspect-level Chinese text sentiment classification (from EMNLP 2016)

Original paper: http://link.zhihu.com/?target=https%3A//arxiv.org/pdf/1605.08900v2.pdf

Code implementation (the script below reads positive/negative Chinese reviews for five product domains, prepends the aspect word to each sentence, and trains a three-hop attention model in Keras):

#coding=utf-8

import codecs
import numpy as np
import pandas as pd
import jieba
# jieba.load_userdict('wordDict.txt')
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import *
from keras.optimizers import *
from keras.layers.core import *
from keras.layers import *
from keras.regularizers import l2
from keras import backend as K
from keras.utils.vis_utils import plot_model

np.random.seed(1337)
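# Assumed environment (not stated in the original post): Python 3, Keras 2.x on the
# TensorFlow backend (get_R below uses K.batch_dot), plus jieba for segmentation and
# pydot/graphviz for plot_model.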

# Read the training set: each domain directory holds pos.txt / neg.txt (GB18030 encoded),
# one review sentence per line. The aspect word (view) is fixed per domain.
def readtrain():
    domains = [('clothing', '衣服'), ('fruit', '水果'), ('hotel', '酒店'),
               ('pda', '手机'), ('shampoo', '洗发')]
    content_train, view_train, opinion_train = [], [], []
    for folder, aspect in domains:
        for label in ('pos', 'neg'):
            f = codecs.open('data/%s/%s.txt' % (folder, label), 'r', encoding='gb18030')
            lines = f.readlines()
            f.close()
            content_train += [str(line).strip() for line in lines]
            view_train += [aspect for _ in lines]
            opinion_train += [label for _ in lines]

    print('The training set has %s sentences' % len(content_train))
    train = [content_train, view_train, opinion_train]
    return train

# Legacy helper: originally converted a list of utf-8 byte strings to unicode;
# under Python 3 it simply copies the list.
def changeListCode(b):
    a = []
    for i in b:
        a.append(i)
    return a

# Segment each sentence with jieba; returns a list of token lists.
def segmentWord2(cont):
    c = []
    for i in cont:
        a = list(jieba.cut(str(i)))
        c.append(a)
    # print(c)
    return c

# Map text labels to integers: 'pos' -> 1, 'neg' -> 0.
def transLabel(labels):
    for i in range(len(labels)):
        if labels[i] == 'pos':
            labels[i] = 1
        elif labels[i] == 'neg':
            labels[i] = 0
        else:
            print("invalid label:", labels[i])
    return labels

# content = ["我 来到 北京 清华大学", "他 来到 了 网易 杭研 大厦"]   # sklearn input format
# content = [['我','来到', '北京'], ['他','来到','了']]            # keras input format
train = readtrain()

content = segmentWord2(train[0])  # segment all sentences at once (the whole corpus)
view = changeListCode(train[1])
opinion = transLabel(train[2])

w = []  # collect every token to build the vocabulary
for i in content:
    w.extend(i)
for i in view:  # also add the aspect (view) words to the vocabulary
    w.append(i)

# The first time step of the embedded sequence is the aspect word.
def get_aspect(X):
    ans = X[:, 0, :]
    return ans

# The remaining time steps are the context words.
def get_context(X):
    ans = X[:, 1:, :]
    return ans

# Attention-weighted sum of the context vectors:
# Y is the transposed context (batch, emb, maxlen-1) and alpha the attention
# weights (batch, maxlen-1); batch_dot gives the weighted context vector r.
def get_R(X):
    Y, alpha = X[0], X[1]
    # ans = K.T.batched_dot(Y, alpha)  # theano backend
    ans = K.batch_dot(Y, alpha)  # tensorflow backend
    return ans

# Hyper-parameters
maxlen = 81     # padded sequence length, including the aspect token at position 0
batch = 32
emb = 300       # embedding dimension
class_num = 2

print('Preprocessing...')
dict = pd.DataFrame(pd.Series(w).value_counts())  # word frequencies; row order defines the word ids
del w
dict['id'] = list(range(1, len(dict) + 1))
get_sent = lambda x: list(dict['id'][x])  # map a token list to a list of word ids
sent = pd.Series(content).apply(get_sent)

for i in range(len(content)):  # insert the aspect id at position 0: the first token of every sequence is the aspect
    a = dict['id'][view[i]]
    sent[i].insert(0, a)

# Pad/truncate at the end ('post') so the aspect id stays at position 0,
# which is where get_aspect reads it from.
sent = list(sequence.pad_sequences(sent, maxlen=maxlen, padding='post', truncating='post'))

train_content = np.array(sent)
train_opinion = np.array(opinion)
train_opinion1 = np_utils.to_categorical(train_opinion, class_num)  # 0 -> [1,0] (neg), 1 -> [0,1] (pos)
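# Resulting layout (illustration, not output from the original post):
#   train_content[i]  = [aspect_id, w1_id, w2_id, ..., 0, 0]   # length == maxlen
#   train_opinion1[i] = [1, 0] for a negative review, [0, 1] for a positive one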

print('Build model...')

main_input = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(output_dim=emb, input_dim=len(dict)+1, input_length=maxlen, name='x')(main_input)
drop_out = Dropout(0.1, name='dropout')(x)
w_aspect = Lambda(get_aspect, output_shape=(emb,), name="w_aspect")(drop_out)
w_context = Lambda(get_context, output_shape=(maxlen-1,emb), name="w_context")(drop_out)

w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name="w_aspect_1")(w_aspect)

# hop 1
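# Each hop follows the multi-hop attention of the deep memory network in the paper:
# the current aspect vector is repeated and concatenated with every context position,
# a tanh scoring layer followed by a softmax layer produces attention weights alpha,
# the context vectors are summed with those weights to give r, and r plus a linear
# transform of the aspect vector forms the aspect representation for the next hop.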
w_aspects = RepeatVector(maxlen-1, name="w_aspects1---hop-1")(w_aspect)
merged = concatenate([w_context, w_aspects], name='merged1')

distributed = TimeDistributed(Dense(1, kernel_regularizer=l2(0.01), activation='tanh'), name="distributed1---hop-1")(merged)
flat_alpha = Flatten(name="flat_alpha1---hop-1")(distributed)
alpha = Dense(maxlen-1, activation='softmax', name="alpha1---hop-1")(flat_alpha)
w_context_trans = Permute((2, 1), name="w_context_trans1")(w_context)
r_ = Lambda(get_R, output_shape=(emb,1), name="r_1---hop-1")([w_context_trans, alpha])

r = Reshape((emb,), name="r1---hop-1")(r_)
w_aspect_linear = Dense(emb, kernel_regularizer=l2(0.01), activation='linear')(w_aspect)
merged = add([r, w_aspect_linear])

w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name="w_aspect_2---hop-1")(merged)

# hop 2
w_aspects = RepeatVector(maxlen-1, name="w_aspects2")(w_aspect)
merged = concatenate([w_context, w_aspects], name='merged2')
distributed = TimeDistributed(Dense(1, kernel_regularizer=l2(0.01), activation='tanh'), name="distributed2")(merged)
flat_alpha = Flatten(name="flat_alpha2")(distributed)
alpha = Dense(maxlen-1, activation='softmax', name="alpha2")(flat_alpha)
w_context_trans = Permute((2, 1), name="w_context_trans2")(w_context)
# r_ = merge([w_context_trans, alpha], output_shape=(emb, 1), name="r_2", mode=get_R)
r_ = Lambda(get_R, output_shape=(emb,1), name="r_2")([w_context_trans, alpha])
r = Reshape((emb,), name="r2")(r_)
w_aspect_linear = Dense(emb, kernel_regularizer=l2(0.01), activation='linear')(w_aspect)
merged = add([r, w_aspect_linear])

w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name="w_aspect_3")(merged)

# hop 3
w_aspects = RepeatVector(maxlen-1, name="w_aspects3")(w_aspect)
merged = concatenate([w_context, w_aspects], name='merged3')
distributed = TimeDistributed(Dense(1, kernel_regularizer=l2(0.01), activation='tanh'), name="distributed3")(merged)
flat_alpha = Flatten(name="flat_alpha3")(distributed)
alpha = Dense(maxlen-1, activation='softmax', name="alpha3")(flat_alpha)
w_context_trans = Permute((2, 1), name="w_context_trans3")(w_context)
r_ = Lambda(get_R, output_shape=(emb,1), name="r_3")([w_context_trans, alpha])
r = Reshape((emb,), name="r3")(r_)
w_aspect_linear = Dense(emb, kernel_regularizer=l2(0.01), activation='linear')(w_aspect)
merged = add([r, w_aspect_linear])

h_ = Activation('tanh')(merged)
out = Dense(class_num, activation='softmax')(h_)
output = out
model = Model(inputs=[main_input], outputs=output)
# exit()

model.summary()
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),  # or SGD(lr=0.03, momentum=0.9, nesterov=True)
              metrics=['accuracy'])

# if os.path.exists('model.w'):
#     model.load_weights('model.w')

print('Train...')
model.fit(train_content, train_opinion1,
          shuffle=True,
          batch_size=batch, epochs=500,
          validation_split=0.02)

model.save_weights('model.w')
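# Minimal inference sketch (not part of the original post). Assumes it runs in the
# same session as training, or that the model above has been rebuilt and
# model.load_weights('model.w') called first; `dict` and `maxlen` are reused.
def predict_opinion(sentence, aspect):
    tokens = list(jieba.cut(sentence))
    # map tokens to ids; words unseen at training time are skipped (no OOV id exists)
    ids = [int(dict['id'][t]) for t in tokens if t in dict.index]
    ids.insert(0, int(dict['id'][aspect]))  # the aspect id goes first, as in training
    x = sequence.pad_sequences([ids], maxlen=maxlen, padding='post', truncating='post')
    prob = model.predict(np.array(x))[0]    # [P(neg), P(pos)]
    return 'pos' if prob[1] >= prob[0] else 'neg'

# Example: predict_opinion('这件衣服质量很好,穿着很舒服', '衣服')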

K.clear_session()