
Code for the Kaggle IMDB movie review score estimation competition


# -*- coding: utf-8 -*-
"""
Created on Mon Apr  2 11:11:39 2017

@author: yichengfan
"""

import pandas as pd

train = pd.read_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\testData.tsv', delimiter='\t')

## Inspect the first few rows of each dataset
train.head()
'''
id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
'''
test.head()
'''
id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...
2    5828_4  All in all, this is a movie for kids. We saw i...
3    7186_2  Afraid of the Dark left me with the impression...
4   12128_7  A very accurate depiction of small time mob li...
'''
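
# The .tsv files are those distributed for Kaggle's "Bag of Words Meets Bags
# of Popcorn" competition; per its data description (assuming unmodified
# files), the labeled training and test sets each hold 25,000 reviews:
print(train.shape)
'''(25000, 3)'''
print(test.shape)
'''(25000, 2)'''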

# Import BeautifulSoup from bs4 to clean up the raw text
from bs4 import BeautifulSoup
# Import the stop-word list from nltk.corpus (NLTK natural language toolkit)
from nltk.corpus import stopwords
import re

# Define a function that performs three preprocessing steps on each raw review
def review_to_text(review, remove_stopwords):
    # Strip HTML markup ('html.parser' is the built-in parser; the original
    # 'html' argument is not a valid parser name for BeautifulSoup)
    raw_text = BeautifulSoup(review, 'html.parser').get_text()
    # Drop all non-letter characters
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    # If remove_stopwords is set, filter the stop words out of the review
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    # Return the review's word list after the three preprocessing steps
    return words
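
# A minimal sanity check (the sample review and the output shown are
# illustrative, not from the original post). The NLTK stop-word corpus is a
# separate one-time download:
import nltk
nltk.download('stopwords')

print(review_to_text('<br />This movie is GREAT, 10/10!', True))
'''
['movie', 'great']
'''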

# Apply the preprocessing to the training and test reviews
X_train = []
for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))

X_test = []
for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))

y_train = train['sentiment']

# Import the text feature extractors CountVectorizer and TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # naive Bayes model
from sklearn.pipeline import Pipeline  # for assembling the processing flow
# Grid search over hyperparameter combinations; the original sklearn.grid_search
# module was removed in scikit-learn 0.20
from sklearn.model_selection import GridSearchCV

# Use Pipeline to build two naive Bayes classifiers that differ only in the
# text vectorizer: CountVectorizer vs. TfidfVectorizer
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

# Configure the hyperparameter search grid for each model
params_count = {'count_vec__binary': [True, False], 'count_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary': [True, False], 'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 1.0, 10.0]}
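
# Note: the dictionary keys use scikit-learn's '<step name>__<parameter>'
# convention, routing each value to the matching Pipeline step. The same
# naming works when setting parameters directly, e.g.:
# pip_count.set_params(count_vec__binary=True, mnb__alpha=1.0)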

# Parallel grid search over the CountVectorizer-based naive Bayes model with 4-fold cross-validation
gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_count.fit(X_train, y_train)
print(gs_count.best_score_)
'''0.88216'''
print(gs_count.best_params_)
'''
{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}
'''

count_y_predict = gs_count.predict(X_test)

# Parallel grid search over the TfidfVectorizer-based naive Bayes model with 4-fold cross-validation
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
gs_tfidf.fit(X_train, y_train)
print(gs_tfidf.best_score_)
'''0.88712'''
print(gs_tfidf.best_params_)
'''
{'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
'''
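
# With bigrams enabled in both grids, the TF-IDF pipeline edges out the
# count-based one in 4-fold cross-validation: 0.88712 vs. 0.88216.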

tfidf_y_predict = gs_tfidf.predict(X_test)

# Use pandas to format the predictions for submission
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
submission_tfidf = pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})

submission_count.to_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_count.csv',
                        index=False)
submission_tfidf.to_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_tfidf.csv',
                        index=False)

# Read the unlabeled data from local disk
unlabeled_train = pd.read_csv(r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\unlabeledTrainData.tsv', delimiter='\t', quoting=3)
unlabeled_train.head()
'''
id                                             review
0   "9999_0"  "Watching Time Chasers, it obvious that it was...
1  "45057_0"  "I saw this film about 20 years ago and rememb...
2  "15561_0"  "Minor Spoilers<br /><br />In New York, Joan B...
3   "7161_0"  "I went to see this film with a great deal of ...
4  "43971_0"  "Yes, I agree with everyone on this site this ...
'''
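
# Note: quoting=3 is csv.QUOTE_NONE, so pandas keeps the literal double quotes
# from the file; that is why they appear inside the id and review values above.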

import nltk.data

# Use NLTK's punkt tokenizer to split each review into English sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
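
# Like the stop-word list, the punkt model is a separate one-time download;
# without it the nltk.data.load call above raises a LookupError:
# import nltk; nltk.download('punkt')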

# Define a function that splits each review into sentences
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    return sentences
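
# Illustrative call (the example review is mine, not from the original post);
# each sentence comes back as its own word list, stop words kept:
review_to_sentences("I loved it. Best film of 1999!", tokenizer)
'''
[['i', 'loved', 'it'], ['best', 'film', 'of']]
'''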

# Build the corpus used to train the word vectors
corpora = []
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review, tokenizer)

# Hyperparameters for the word-vector model
num_features = 300      # dimensionality of the word vectors
min_word_count = 20     # ignore words with fewer total occurrences
num_workers = 4         # number of parallel worker threads
context = 10            # context window size
downsampling = 1e-3     # downsampling threshold for frequent words

from gensim.models import word2vec
print("Training model...")
# Train the word vectors
model = word2vec.Word2Vec(corpora, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

# Normalize the vectors in place to save memory
model.init_sims(replace=True)

model_name = r"F:\TS\03_other_parts\kaggle\02_IMDB\02_data\300features_20minwords_10context"
model.save(model_name)
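
# Side note: gensim >= 4.0 renamed 'size' to 'vector_size' and deprecated
# init_sims(), so on current gensim the training call would instead read:
# model = word2vec.Word2Vec(corpora, workers=num_workers,
#                           vector_size=num_features, min_count=min_word_count,
#                           window=context, sample=downsampling)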

# Load the trained word-vector model back in
from gensim.models import Word2Vec
model = Word2Vec.load(model_name)
# Probe what the word-vector model has learned
model.most_similar("man")
'''
[('woman', 0.6398072242736816),
('lady', 0.593010663986206),
('lad', 0.5564907789230347),
('soldier', 0.5520418882369995),
('chap', 0.5444163084030151),
('person', 0.5429509878158569),
('guy', 0.5271977186203003),
('monk', 0.5111091136932373),
('men', 0.5074273347854614),
('boy', 0.5039346814155579)]
'''
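
# Under gensim >= 4 the lookup methods live on the KeyedVectors attribute
# rather than on the model itself: model.wv.most_similar("man")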

import numpy as np

# Define a function that builds a review-level feature vector from word vectors
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index2word_set = set(model.index2word)
    # Sum the vectors of all in-vocabulary words, then divide by their count
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    featureVec = np.divide(featureVec, nwords)
    return featureVec
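
# The per-word loop can be collapsed into a single numpy call. A hypothetical
# vectorized rewrite (same gensim < 4 API as above; the guard avoids the
# division by zero that makeFeatureVec hits when no word is in the vocabulary):
def make_feature_vec_mean(words, model, num_features):
    vocab = set(model.index2word)
    known = [w for w in words if w in vocab]
    if not known:
        return np.zeros((num_features,), dtype="float32")
    return np.mean(model[known], axis=0)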

# Define another function that converts each review into a word-vector-based
# feature vector (the average of its word vectors)
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter += 1
    return reviewFeatureVecs

# Build the new word-vector-based training and test feature matrices
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_text(review, remove_stopwords=True))

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_text(review, remove_stopwords=True))

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

# Gradient boosted trees on the averaged word vectors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gbc = GradientBoostingClassifier()

params_gbc = {'n_estimators':[10, 100, 500], 'learning_rate':[0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=4, n_jobs=-1, verbose=1)

gs.fit(trainDataVecs, y_train)

print(gs.best_score_)
print(gs.best_params_)

result = gs.predict(testDataVecs)
# Write the test results
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(r"F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_w2v.csv", index=False, quoting=3)