
[TensorFlow Practice] 2 - Sentiment Classification of Twitter Data

2017-05-12 14:37

Preface

This exercise continues from the previous post and focuses on the workflow for classifying text data. Compared with the first practice exercise, I will walk through what is different and what was optimized.

The dataset is much larger this time: 1.6 million tweets, far more than my little MacBook Air can load at once. There is still a lot of work to do on the big-data processing side.

The larger dataset motivates using TFRecord files to read and process the data, which greatly simplifies the queue-based input pipeline later on.

Unlike the previous exercise, I no longer read the file line by line with readline; instead pandas handles the raw data. Again, because the dataset is so large, the full 1.6 million rows cannot all be loaded into memory (a chunked-reading sketch follows).
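The scripts below simply slice off the first rows to keep things in memory. As a minimal sketch (my own addition, not part of the original code), pandas can also read the CSV in chunks so the full 1.6M rows never have to fit in memory at once; the chunk size of 100000 and the process() call are placeholders.

import pandas as pd

csv_path = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
reader = pd.read_csv(csv_path,
                     names=['polarity', 'id', 'data', 'Query', 'username', 'text'],
                     usecols=['polarity', 'text'],
                     encoding='latin-1',
                     chunksize=100000)

for chunk in reader:
    # each chunk is an ordinary DataFrame with at most 100000 rows,
    # so it can be tokenized / written to TFRecord piece by piece
    process(chunk)  # process() is a placeholder for your own handling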

Experiment workflow

Build the vocabulary and save it. Building a very long vocabulary works poorly for variable-length data: it becomes slow and silently inflates the data size. I will revise this once I find a better solution (a faster lookup sketch is given after this list).

Create the training dataset, vectorize each tweet into a bag-of-words vector (word2vector), and save it in TFRecord format.

Build a feed-forward neural network and train it in batches, reading the TFRecord data through a queue. Each epoch iterates over all of the training data, and accuracy is evaluated on the test data after each epoch.

Compare accuracies and save the best model.

Load the model and predict the sentiment of new tweets.
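On the slow vocabulary step mentioned above: wordList.index(word) in the scripts below is O(n) per word, so a large part of the cost comes from the lookup itself. A minimal sketch (my own addition, not from the original scripts) of a faster variant builds a word-to-index dict once and then does O(1) lookups:

import numpy as np

def build_word_index(wordList):
    # map every vocabulary word to its position once
    return {word: i for i, word in enumerate(wordList)}

def word2vector_fast(words, word_index):
    # bag-of-words one-hot feature, same layout as the original word2vector
    feature = np.zeros(len(word_index))
    for word in words:
        idx = word_index.get(word)
        if idx is not None:
            feature[idx] = 1
    return feature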

Experiment data: http://help.sentiment140.com/for-students/

0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

1 – the id of the tweet (2087)

2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)

3 – the query (lyx). If there is no query, then this value is NO_QUERY.

4 – the user that tweeted (robotickilldozr)

5 – the text of the tweet (Lyx is cool)

The code is below.

create_lexcion.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle

def create_vacabularyDict(texts):
    # tokenize every tweet and collect all the words
    wordList = []
    for line in texts:
        words = word_tokenize(line.lower().decode('latin-1'))
        wordList.extend(words)

    # lemmatize (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    wordList = [lemmatizer.lemmatize(word) for word in wordList]

    # keep only words that are neither too common nor too rare
    wordSet = []
    word_count = Counter(wordList)
    for word in word_count:
        if word_count[word] < 100000 and word_count[word] > 100:
            wordSet.append(word)

    # char2int = dict((c, i) for i, c in enumerate(wordSet))
    # int2char = dict((i, c) for i, c in enumerate(wordSet))

    return wordSet

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

# dftemp = df[:10000]
dftext = df['text']

wordList = create_vacabularyDict(dftext)

# save the vocabulary
with open('../lexcion.pickle', 'wb') as f:
    pickle.dump(wordList, f)
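A quick sanity check (my own addition): load the saved vocabulary and print its size. This is the input dimension of the network, which train.py hard-codes as N_INPUT = 8053; if you rebuild the lexicon, use len(wordList) instead of the hard-coded number.

import pickle

with open('../lexcion.pickle', 'rb') as f:
    wordList = pickle.load(f)

print(len(wordList))  # vocabulary size = input dimension of the network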


create_dataset.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.utils import shuffle

# preprocessing: map polarity to one-hot labels and tweets to bag-of-words vectors
def process_for_train(file, wordList):
    dftemp = file.copy()
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0: [0, 0, 1],   # negative
        2: [0, 1, 0],   # neutral
        4: [1, 0, 0]    # positive
    }

    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)

    return dftemp

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

org_train_filepath = "../data/trainingandtestdata/training.1600000.processed.noemoticon.csv"
org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_train_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

# shuffle and keep only 3000 rows so the dataset fits in memory
df = shuffle(df)
df = df[:3000].reset_index(drop=True)

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

dftemp = process_for_train(df, wordList)
num_dataset = len(dftemp)

# serialize each sample as a tf.train.Example and write it to a TFRecord file
filename = "../data/output_train.tfrecords"
writer = tf.python_io.TFRecordWriter(filename)
for i in range(num_dataset):
    word = map(int, dftemp.get_value(i, 'text'))
    label = map(int, dftemp.get_value(i, 'polarity'))

    example = tf.train.Example(features=tf.train.Features(
        feature={
            'word': _int64_feature(word),
            'label': _int64_feature(label)
        }
    ))

    writer.write(example.SerializeToString())

writer.close()
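A minimal verification sketch (my own addition): read back the first record from the file create_dataset.py just wrote and check that the 'word' and 'label' features have the expected lengths.

import tensorflow as tf

record_path = "../data/output_train.tfrecords"
for record in tf.python_io.tf_record_iterator(record_path):
    example = tf.train.Example()
    example.ParseFromString(record)
    word = example.features.feature['word'].int64_list.value
    label = example.features.feature['label'].int64_list.value
    print(len(word), list(label))  # vocabulary-sized vector and a 3-way one-hot label
    break  # only inspect the first record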


train.py

# -*- coding: UTF-8 -*-
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
import pickle
# from create_dataset import process_for_train

def process_for_train(file, wordList):
    dftemp = file.copy()
    lemmatizer = WordNetLemmatizer()
    int_to_vector = {
        0: [0, 0, 1],
        2: [0, 1, 0],
        4: [1, 0, 0]
    }

    def word2vector(text):
        line_text = word_tokenize(text.lower().decode('latin-1'))
        words = [lemmatizer.lemmatize(word) for word in line_text]
        feature = np.zeros(len(wordList))
        for word in words:
            if word in wordList:
                feature[wordList.index(word)] = 1
        feature = list(feature)
        return feature

    dftemp['polarity'] = dftemp['polarity'].map(int_to_vector)
    dftemp['text'] = dftemp['text'].map(word2vector)

    return dftemp

# define the fully connected network structure
def inference(input_tesnor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc1 = tf.add(tf.matmul(input_tesnor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))

        output = tf.add(tf.matmul(relu2, output_W), output_b)

    return output

N_INPUT = 8053      # vocabulary size, i.e. len(wordList) from create_lexcion.py
N_OUTPUT = 3        # negative / neutral / positive
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000
TRAIN_EPOCHS = 1000

NUM_TRAIN = 3000    # number of samples written to the training TFRecord file

org_test_filepath = "../data/trainingandtestdata/testdata.manual.2009.06.14.csv"

df = pd.read_csv(org_test_filepath,
                 names=['polarity','id','data','Query','username','text'],
                 usecols=['polarity','text'])

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

# vectorize 300 test tweets for evaluating accuracy during training
dftemp = process_for_train(df[:300], wordList)

X_test = np.reshape(np.array(dftemp['text'].tolist()), (len(dftemp), N_INPUT))
y_test = np.reshape(np.array(dftemp['polarity'].tolist()), (len(dftemp), N_OUTPUT))

file = "../data/output_train.tfrecords"
test_file = "output_test.tfrecords"
# string_input_producer can take a list of several input files
filename_queue = tf.train.string_input_producer([file], shuffle=False)

reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        'word': tf.FixedLenFeature([8053], tf.int64),
        'label': tf.FixedLenFeature([3], tf.int64)
    }
)

X_train = features['word']
y_train = features['label']

min_after_dequeue = 1000
batch_size = 100
capacity = min_after_dequeue + batch_size * 3

# shuffle_batch assembles shuffled mini-batches from the TFRecord queue
X_batch, y_batch = tf.train.shuffle_batch([X_train, y_train],
                                          min_after_dequeue=min_after_dequeue,
                                          batch_size=batch_size,
                                          capacity=capacity)

def train():
    X = tf.placeholder(tf.float32, [None, N_INPUT], name='x-input')
    y = tf.placeholder(tf.float32, [None, N_OUTPUT], name='y-output')

    predict = inference(X)
    # the labels must be the y placeholder (fed together with X), not y_batch
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    # evaluation statistics
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver = tf.train.Saver()

        pre_accuracy = 0  # best accuracy so far

        for i in range(TRAIN_EPOCHS):
            # one epoch: run through all NUM_TRAIN samples batch by batch
            for j in range(int(NUM_TRAIN / batch_size)):
                cur_X_batch, cur_y_batch = sess.run([X_batch, y_batch])
                _, loss = sess.run([optimizer, cost_func], feed_dict={X: cur_X_batch, y: cur_y_batch})
            if i % 10 == 0:
                print "After %d training steps, loss is %g" % (i, loss)

            temp_accuracy = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
            if temp_accuracy > pre_accuracy:  # keep the model with the highest accuracy
                print('accuracy is : ', temp_accuracy)
                pre_accuracy = temp_accuracy
                saver.save(sess, '../model/model.ckpt')  # save the session

        coord.request_stop()
        coord.join(threads)

train()
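The commented-out tf.nn.dropout lines in inference() hint at dropout regularization. Below is a minimal sketch (my own addition, not part of the original post) of how a hidden layer with dropout could look; the fc_layer_with_dropout name and the keep_prob placeholder are assumptions, and the dropout rate would be fed as 0.5 during training and 1.0 during evaluation.

import tensorflow as tf

def fc_layer_with_dropout(inputs, in_dim, out_dim, keep_prob, scope):
    # one fully connected layer followed by ReLU and dropout,
    # mirroring the pattern hinted at by the commented-out lines in inference()
    with tf.variable_scope(scope):
        W = tf.get_variable("weight", [in_dim, out_dim],
                            initializer=tf.truncated_normal_initializer(stddev=0.1))
        b = tf.get_variable("bias", [out_dim],
                            initializer=tf.random_normal_initializer(stddev=1.0))
        h = tf.nn.relu(tf.add(tf.matmul(inputs, W), b))
        return tf.nn.dropout(h, keep_prob)

# usage sketch:
# keep_prob = tf.placeholder(tf.float32, name='keep-prob')
# relu1 = fc_layer_with_dropout(X, N_INPUT, NUM_LAYER1, keep_prob, 'layer1-fc')
# ...feed keep_prob=0.5 while training, 1.0 while evaluating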


test.py

# -*- coding: UTF-8 -*-
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

f = open('../lexcion.pickle', 'rb')
wordList = pickle.load(f)
f.close()

N_INPUT = len(wordList)  # input layer size = vocabulary size
N_OUTPUT = 3
NUM_LAYER1 = 1000
NUM_LAYER2 = 1000

def word2vector(text):
    line_text = word_tokenize(text.lower().decode('latin-1'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in line_text]
    feature = np.zeros(len(wordList))
    for word in words:
        if word in wordList:
            feature[wordList.index(word)] = 1
    feature = list(feature)
    return feature

# define the fully connected network structure (same as train.py)
def inference(input_tesnor):
    with tf.variable_scope('layer1-fc'):
        fc1_W = tf.get_variable("weight", [N_INPUT, NUM_LAYER1],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_b = tf.get_variable("bias", [NUM_LAYER1],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc1 = tf.add(tf.matmul(input_tesnor, fc1_W), fc1_b)
        relu1 = tf.nn.relu(fc1)
        # if train:
        #     fc1 = tf.nn.dropout(fc1, 0.5)

    with tf.variable_scope('layer2-fc'):
        fc2_W = tf.get_variable("weight", [NUM_LAYER1, NUM_LAYER2],
                                initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_b = tf.get_variable("bias", [NUM_LAYER2],
                                initializer=tf.random_normal_initializer(stddev=1.0))

        fc2 = tf.add(tf.matmul(relu1, fc2_W), fc2_b)
        relu2 = tf.nn.relu(fc2)

    with tf.variable_scope('output-fc'):
        output_W = tf.get_variable("weight", [NUM_LAYER2, N_OUTPUT],
                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
        output_b = tf.get_variable("bias", [N_OUTPUT],
                                   initializer=tf.random_normal_initializer(stddev=1.0))

        output = tf.add(tf.matmul(relu2, output_W), output_b)

    return output

def prediction(tweet_text):
    tweet_vector = word2vector(tweet_text)

    X = tf.placeholder('float')
    predict = inference(X)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        saver.restore(sess, '../model/model.ckpt')

        res = sess.run(tf.argmax(predict.eval(feed_dict={X: [tweet_vector]}), 1))
        return res

print prediction("happy time ")
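A small usage sketch (my own addition, meant to live in test.py): prediction() builds the graph with tf.get_variable, so calling it a second time in the same process raises a variable-reuse error. Predicting several tweets at once can be done by vectorizing them together and running one session; prediction_batch is a hypothetical helper name.

def prediction_batch(tweet_texts):
    tf.reset_default_graph()  # clear any graph built by a previous prediction() call
    vectors = [word2vector(t) for t in tweet_texts]

    X = tf.placeholder('float')
    predict = inference(X)

    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, '../model/model.ckpt')
        # argmax index 0 = positive, 1 = neutral, 2 = negative,
        # following int_to_vector in create_dataset.py
        return sess.run(tf.argmax(predict, 1), feed_dict={X: vectors})

print(prediction_batch(["happy time ", "this is terrible"]))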


Summary

To be honest, the results are not great: accuracy is only around 80%, and the predictions are worst for neutral tweets. I tweaked quite a few things without much improvement; suggestions are very welcome.

That wraps up text classification for now; next I plan to try TFLearn and TensorBoard visualization.

References

http://blog.topspeedsnail.com/archives/10420

https://www.tensorflow.org/api_docs/

TensorFlow实战Google深度学习框架 (TensorFlow in Practice: Google Deep Learning Framework), Caicloud