Aspect-Level Chinese Text Sentiment Classification in Keras (based on an EMNLP 2016 paper)
2018-01-02 08:36
Original paper: Aspect Level Sentiment Classification with Deep Memory Network (Tang et al., EMNLP 2016): https://arxiv.org/pdf/1605.08900v2.pdf
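The network is the paper's multi-hop attention model ("deep memory network"), with the aspect word prepended as the first token of every input sequence. Each hop scores the context words against the current aspect vector and mixes the attended context back into it. In rough notation (my paraphrase of what the Keras graph below computes; $W_{att}$, $W_{\alpha}$, $W_{lin}$ are my labels for the Dense-layer weights, $m_i$ the $k = \text{maxlen} - 1$ context embeddings, $v_a$ the current aspect vector):

$$g_i = \tanh\big(W_{att}\,[m_i ;\, v_a] + b_{att}\big), \qquad \alpha = \mathrm{softmax}\big(W_{\alpha}\, g + b_{\alpha}\big)$$

$$r = \sum_{i=1}^{k} \alpha_i\, m_i, \qquad v_a' = r + W_{lin}\, v_a$$

Note that the code inserts an extra Dense layer between the scores $g$ and the softmax, so $\alpha$ is a learned mixture of the per-position scores; a plain $\mathrm{softmax}(g)$ would follow the paper more literally. After each of the first two hops, $v_a'$ passes through another Dense layer and becomes the query for the next hop; after the third hop, $\tanh(v_a')$ feeds the final two-class softmax.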
Code implementation:
```python
# coding=utf-8
import codecs

import jieba
import numpy as np
import pandas as pd
from keras import backend as K
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model

# jieba.load_userdict('wordDict.txt')
np.random.seed(1337)


def readtrain():
    """Read the training set. Each product domain contributes a pos.txt and a
    neg.txt; the domain word (e.g. '衣服') is the aspect ("view") attached to
    every sentence from that domain."""
    domains = [('clothing', '衣服'), ('fruit', '水果'), ('hotel', '酒店'),
               ('pda', '手机'), ('shampoo', '洗发')]
    content_train, view_train, opinion_train = [], [], []
    for folder, aspect in domains:
        for label in ('pos', 'neg'):
            with codecs.open('data/%s/%s.txt' % (folder, label), 'r',
                             encoding='gb18030') as f:
                lines = f.readlines()
            content_train += [line.strip() for line in lines]
            view_train += [aspect] * len(lines)
            opinion_train += [label] * len(lines)
    print('The training set has %s sentences' % len(content_train))
    return [content_train, view_train, opinion_train]


def changeListCode(b):
    """Legacy helper: under Python 2 this converted a utf-8 list to unicode;
    under Python 3 it is simply a copy."""
    return list(b)


def segmentWord2(cont):
    """Segment every sentence into a list of words with jieba."""
    return [list(jieba.cut(str(i))) for i in cont]


def transLabel(labels):
    """Map string labels to integers: 'pos' -> 1, 'neg' -> 0."""
    for i in range(len(labels)):
        if labels[i] == 'pos':
            labels[i] = 1
        elif labels[i] == 'neg':
            labels[i] = 0
        else:
            print('invalid label:', labels[i])
    return labels


# content = ["我 来到 北京 清华大学", "他 来到 了 网易 杭研 大厦"]   # sklearn input format
# content = [['我', '来到', '北京'], ['他', '来到', '了']]           # keras input format
train = readtrain()
content = segmentWord2(train[0])   # segment all sentences
view = changeListCode(train[1])
opinion = transLabel(train[2])

w = []   # every token in the corpus, plus the aspect words themselves
for i in content:
    w.extend(i)
for i in view:
    w.append(i)


def get_aspect(X):
    # position 0 of each sequence holds the aspect word
    return X[:, 0, :]


def get_context(X):
    # the remaining positions are the context words
    return X[:, 1:, :]


def get_R(X):
    # weighted sum of the context vectors with the attention weights alpha
    Y, alpha = X[0], X[1]
    # return K.T.batched_dot(Y, alpha)   # theano backend
    return K.batch_dot(Y, alpha)         # tensorflow backend


# hyperparameters
maxlen = 81      # padded length: 1 aspect word + up to 80 context words
batch = 32
emb = 300        # embedding size
class_num = 2

print('Preprocessing...')
word_df = pd.DataFrame(pd.Series(w).value_counts())   # word frequency table
del w
word_df['id'] = list(range(1, len(word_df) + 1))      # integer id per word; 0 is padding
get_sent = lambda x: list(word_df['id'][x])
sent = pd.Series(content).apply(get_sent)
for i in range(len(content)):
    # insert the aspect id at position 0: the first "word" of every sentence is its view
    sent[i].insert(0, word_df['id'][view[i]])
# pad at the end so the aspect id stays at position 0
# (the default 'pre' padding would shift it away from index 0)
sent = list(sequence.pad_sequences(sent, maxlen=maxlen,
                                   padding='post', truncating='post'))
train_content = np.array(sent)
train_opinion = np.array(opinion)
train_opinion1 = np_utils.to_categorical(train_opinion, class_num)

print('Build model...')
main_input = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(output_dim=emb, input_dim=len(word_df) + 1,
              input_length=maxlen, name='x')(main_input)
drop_out = Dropout(0.1, name='dropout')(x)
w_aspect = Lambda(get_aspect, output_shape=(emb,), name='w_aspect')(drop_out)
w_context = Lambda(get_context, output_shape=(maxlen - 1, emb), name='w_context')(drop_out)
w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name='w_aspect_1')(w_aspect)


def attention_hop(w_aspect, w_context, hop):
    """One attention hop: score each context position against the current
    aspect vector, turn the scores into weights alpha, take the weighted sum
    of the context, and add a linear transform of the aspect."""
    w_aspects = RepeatVector(maxlen - 1, name='w_aspects%d' % hop)(w_aspect)
    merged = concatenate([w_context, w_aspects], name='merged%d' % hop)
    distributed = TimeDistributed(
        Dense(1, kernel_regularizer=l2(0.01), activation='tanh'),
        name='distributed%d' % hop)(merged)
    flat_alpha = Flatten(name='flat_alpha%d' % hop)(distributed)
    alpha = Dense(maxlen - 1, activation='softmax', name='alpha%d' % hop)(flat_alpha)
    w_context_trans = Permute((2, 1), name='w_context_trans%d' % hop)(w_context)
    r_ = Lambda(get_R, output_shape=(emb, 1), name='r_%d' % hop)([w_context_trans, alpha])
    r = Reshape((emb,), name='r%d' % hop)(r_)
    w_aspect_linear = Dense(emb, kernel_regularizer=l2(0.01), activation='linear')(w_aspect)
    return add([r, w_aspect_linear])


# three hops; after hops 1 and 2 the combined vector becomes the next query
merged = attention_hop(w_aspect, w_context, hop=1)
w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name='w_aspect_2')(merged)
merged = attention_hop(w_aspect, w_context, hop=2)
w_aspect = Dense(emb, kernel_regularizer=l2(0.01), name='w_aspect_3')(merged)
merged = attention_hop(w_aspect, w_context, hop=3)

h_ = Activation('tanh')(merged)
output = Dense(class_num, activation='softmax')(h_)
model = Model(inputs=[main_input], outputs=output)
model.summary()
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),   # or SGD(lr=0.03, momentum=0.9, nesterov=True)
              metrics=['accuracy'])
# resume from saved weights if present:
# if os.path.exists('model.w'):
#     model.load_weights('model.w')

print('Train...')
model.fit(train_content, train_opinion1, shuffle=True, batch_size=batch,
          epochs=500, validation_split=0.02)
model.save_weights('model.w')
K.clear_session()
```