[TensorFlow Hands-on Practice] 2 - Sentiment Classification of Twitter Data
2017-05-12 14:37
Preface
This exercise continues the previous post and again walks through a text-classification workflow; compared with the first practice exercise, this post focuses on what is different and what was optimized. The dataset is much larger, 1.6 million tweets, and my little MacBook Air simply cannot read all of it at once, so there is still a lot of work to do on handling large data.
The larger dataset means the data is read and processed as TFRecord files, which makes the later queue-based input pipeline much more convenient.
Unlike the previous exercise, the file is no longer read line by line with readline; the raw data is processed with pandas instead. Again the reason is size: all 1.6 million rows cannot be loaded into memory at once.
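As a side note, a minimal sketch of reading the CSV in chunks with pandas; this is only one way to avoid loading everything at once, not what the scripts below actually do (they sample a subset instead), and the chunk size and column names here are my own assumptions based on the Sentiment140 layout listed further down:

```python
import pandas as pd

CSV_PATH = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
COLUMNS = ['polarity', 'id', 'date', 'query', 'username', 'text']  # assumed Sentiment140 layout

# Read the 1.6M-row CSV 100k rows at a time so it never has to fit in memory;
# each chunk is an ordinary DataFrame that can be vectorized and written out.
reader = pd.read_csv(CSV_PATH, names=COLUMNS, usecols=['polarity', 'text'],
                     encoding='latin-1', chunksize=100000)
for chunk in reader:
    print(len(chunk), chunk['polarity'].value_counts().to_dict())
```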
Workflow
Build the vocabulary and save it. Building such a long vocabulary works poorly for variable-length text: it is slow and quietly inflates the data volume, so I will change it once I find a better approach. Then create the training set, vectorize each tweet with the word2vector (bag-of-words) encoding sketched right after these steps, and save it in TFRecord format.
Build a feed-forward neural network and train it in batches, reading the TFRecord data through a queue; each epoch iterates over all the training data and evaluates accuracy on the test data.
Compare the accuracies and save the best model.
Load the saved model and predict the sentiment of new tweets.
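To clarify the word2vector step: it is not word2vec embeddings but a binary bag-of-words vector over the saved vocabulary. A minimal sketch of the idea, with a toy vocabulary made up purely for illustration (the real scripts below use the pickled lexicon):

```python
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def word2vector(text, word_list):
    """Binary bag-of-words vector of length len(word_list)."""
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in word_tokenize(text.lower())]
    feature = np.zeros(len(word_list))
    for w in words:
        if w in word_list:
            feature[word_list.index(w)] = 1
    return feature

toy_lexicon = ['happy', 'sad', 'cool', 'time']   # made-up vocabulary for illustration
print(word2vector("Lyx is cool", toy_lexicon))   # -> [0. 0. 1. 0.]
```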
Dataset: http://help.sentiment140.com/for-students/
0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 – the id of the tweet (2087)
2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 – the query (lyx). If there is no query, then this value is NO_QUERY.
4 – the user that tweeted (robotickilldozr)
5 – the text of the tweet (Lyx is cool)
The code follows.
create_lexcion.py
```python
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle


def create_vacabularyDict(filename):
    wordList = []
    for word in filename:
        word = word_tokenize(word.lower().decode('latin-1'))
        wordList.extend(word)

    # lemmatize (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    wordList = [lemmatizer.lemmatize(word) for word in wordList]

    # keep only words that are neither too frequent nor too rare
    wordSet = []
    word_count = Counter(wordList)
    for word in word_count:
        if word_count[word] < 100000 and word_count[word] > 100:
            wordSet.append(word)
    # char2int = dict((c, i) for i, c in enumerate(wordSet))
    # int2char = dict((i, c) for i, c in enumerate(wordSet))
    return wordSet


org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])
# dftemp = df[:10000]
dftext = df['text']

wordList = create_vacabularyDict(dftext)

# save the vocabulary
with open('../lexcion.pickle', 'wb') as f:
    pickle.dump(wordList, f)
```
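A note of my own, not from the original post: train.py further down hardcodes the input size as 8053, which only matches if the pickled lexicon happens to contain exactly that many words. Deriving it from the lexicon avoids the mismatch:

```python
import pickle

# Load the saved vocabulary and take its length as the input dimension.
with open('../lexcion.pickle', 'rb') as f:
    wordList = pickle.load(f)
N_INPUT = len(wordList)   # use this instead of a hardcoded 8053
print(N_INPUT)
```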
create_dataset.py
```python
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.utils import shuffle


# preprocessing
def process_for_train(file, wordList):
    dftemp = file.copy()
    # length = len(df)
    # num_process = 1000
    # dftemp = pd.DataFrame(columns=['polarity','text'])
    dataset = []
    lemmatizer = WordNetLemmatizer()

    # one-hot encoding of the polarity label
    int_to_vector = {
        0: [0, 0, 1],
        2: [0, 1, 0],
        4: [1, 0, 0]
    }

    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)
    return dftemp


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])
df = shuffle(df)
df = df[:3000].reset_index(drop=True)

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df, wordList)

num_dataset = len(dftemp)
# len_dataset = 5000
filename = "../data/output_train.tfrecords"
writer = tf.python_io.TFRecordWriter(filename)
for i in range(num_dataset):
    # word = ",".join(map(str,dftemp.get_value(index,'text')))
    # label = ",".join(map(str,dftemp.get_value(index,'polarity')))
    word = map(int, dftemp.get_value(i, 'text'))
    label = map(int, dftemp.get_value(i, 'polarity'))
    example = tf.train.Example(features=tf.train.Features(
        feature={
            'word': _int64_feature(word),
            'label': _int64_feature(label)
        }
    ))
    writer.write(example.SerializeToString())
writer.close()
```
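As a quick sanity check (my own addition, assuming the TensorFlow 1.x record API), the written file can be read back and the first record decoded:

```python
import tensorflow as tf

# Iterate over the serialized records and decode the first one.
for record in tf.python_io.tf_record_iterator("../data/output_train.tfrecords"):
    example = tf.train.Example.FromString(record)
    word = example.features.feature['word'].int64_list.value
    label = example.features.feature['label'].int64_list.value
    print(len(word), list(label))   # lexicon-sized bag-of-words vector, one-hot label
    break
```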
train.py
```python
# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
# from create_dataset import process_for_train


def process_for_train(file, wordList):
    dftemp = file.copy()
    dataset = []
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0: [0, 0, 1],
        2: [0, 1, 0],
        4: [1, 0, 0]
    }

    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)
    return dftemp


# fully connected network
def inference(input_tesnor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc1 = tf.add(tf.matmul(input_tesnor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))
        output = tf.add(tf.matmul(relu2, output_W), output_b)
    return output


N_INPUT = 8053          # must equal len(wordList)
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000
TRAIN_EPOCHS = 1000
NUM_TRAIN = 3000

org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"
df = pd.read_csv(org_test_filepath,
                 names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                 usecols=['polarity', 'text'])

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df[:300], wordList)
X_test = np.reshape(np.array(dftemp['text'].tolist()), (len(dftemp), N_INPUT))
y_test = np.reshape(np.array(dftemp['polarity'].tolist()), (len(dftemp), N_OUTPUT))

file = "../data/output_train.tfrecords"
test_file = "output_test.tfrecords"

# string_input_producer supports multi-file input
filename_queue = tf.train.string_input_producer([file], shuffle=False)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        'word': tf.FixedLenFeature([8053], tf.int64),
        'label': tf.FixedLenFeature([3], tf.int64)
    }
)
X_train = features['word']
y_train = features['label']

min_after_dequeue = 1000
batch_size = 100
capacity = min_after_dequeue + batch_size * 3
X_batch, y_batch = tf.train.shuffle_batch([X_train, y_train],
                                          min_after_dequeue=min_after_dequeue,
                                          batch_size=batch_size,
                                          capacity=capacity)


def train():
    X = tf.placeholder(tf.float32, [None, N_INPUT], name='x-input')
    y = tf.placeholder(tf.float32, [None, N_OUTPUT], name='y-output')

    predict = inference(X)
    # the labels must come from the same feed as the inputs, so use the
    # placeholder y here rather than the y_batch queue tensor
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    # eval statistics
    # test_predict = inference(X)
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver = tf.train.Saver()

        pre_accuracy = 0  # best accuracy so far
        for i in range(TRAIN_EPOCHS):
            # for j in range()
            # cur_X_batch,cur_y_batch = sess.run([X_batch,y_batch])
            # feed_dict = {X: cur_X_batch, y: cur_y_batch}
            for j in range(int(NUM_TRAIN / batch_size)):
                cur_X_batch, cur_y_batch = sess.run([X_batch, y_batch])
                _, loss = sess.run([optimizer, cost_func],
                                   feed_dict={X: cur_X_batch, y: cur_y_batch})
            if i % 10 == 0:
                print "After %d training step,loss is %g" % (i, loss)
                # if i % 10 == 0:
                temp_accuracy = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
                if temp_accuracy > pre_accuracy:
                    # save the model with the highest accuracy
                    print('accuracy is : ', temp_accuracy)
                    pre_accuracy = temp_accuracy
                    saver.save(sess, '../model/model.ckpt')  # save the session
        # print('accuracy: ', accuracy.eval({X: list(X_test), y: list(y_test)}))

        coord.request_stop()
        coord.join(threads)


train()
```
test.py
```python
# -*- coding: UTF-8 -*-
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

N_INPUT = len(wordList)  # input layer size
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000


def word2vector(text):
    line_text = word_tokenize(text.lower().decode('latin-1'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in line_text]
    feature = np.zeros(len(wordList))
    for word in words:
        if word in wordList:
            feature[wordList.index(word)] = 1
    feature = list(feature)
    return feature


# fully connected network (same structure as in train.py)
def inference(input_tesnor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc1 = tf.add(tf.matmul(input_tesnor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))
        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))
        output = tf.add(tf.matmul(relu2, output_W), output_b)
    return output


def prediction(tweet_text):
    tweet_vector = word2vector(tweet_text)
    X = tf.placeholder('float')
    predict = inference(X)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        saver.restore(sess, '../model/model.ckpt')
        res = sess.run(tf.argmax(predict.eval(feed_dict={X: [tweet_vector]}), 1))
        return res


print prediction("happy time ")
```
Summary
To be honest, the results are not great: accuracy sits at roughly 80%, and neutral tweets are predicted worst. I changed quite a few things, but the improvement was small. Suggestions are very welcome.
Text classification is on hold for now; next I plan to try TFLearn and TensorBoard visualization.
References
http://blog.topspeedsnail.com/archives/10420
https://www.tensorflow.org/api_docs/
TensorFlow实战Google深度学习框架 Caicloud