Kaggle —— IMDB影评得分估计竞赛代码
2018-04-03 09:38
1306 查看
IMDB影评得分估计竞赛代码
# -*- coding: utf-8 -*-
"""
Kaggle IMDB movie-review sentiment script.

Created on Mon Apr 2 11:11:39 2017

@author: yichengfan

The script runs three experiments on the labelled IMDB reviews and writes one
submission file for each:

1. CountVectorizer + multinomial naive Bayes (grid-searched).
2. TfidfVectorizer + multinomial naive Bayes (grid-searched).
3. Averaged word2vec vectors (trained on the unlabelled reviews) fed into a
   grid-searched gradient boosting classifier.
"""
import re
from functools import lru_cache

import nltk.data
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.models import word2vec
from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# `sklearn.grid_search` was removed from scikit-learn; GridSearchCV now lives
# in `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Input / output locations (the exact paths used by the original script).
TRAIN_PATH = r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\labeledTrainData.tsv'
TEST_PATH = r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\testData.tsv'
UNLABELED_PATH = r'F:\TS\03_other_parts\kaggle\02_IMDB\02_data\unlabeledTrainData.tsv'
SUBMISSION_COUNT_PATH = r'F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_count.csv'
SUBMISSION_TFIDF_PATH = r'F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_tfidf.csv'
SUBMISSION_W2V_PATH = r"F:\TS\03_other_parts\kaggle\02_IMDB\04_output\submission_w2v.csv"

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a set, built only once."""
    return frozenset(stopwords.words('english'))


def review_to_text(review, remove_stopwords):
    """Turn one raw review into a list of lower-case words.

    Three steps are applied: HTML markup is stripped, every non-letter
    character is replaced by a space, and (optionally) English stop words
    are dropped.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When true, remove NLTK English stop words.

    Returns:
        The list of remaining lower-case words, in their original order.
    """
    # Strip HTML markup ('html' parser choice kept as in the original).
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Replace non-letter characters with spaces, then tokenise on whitespace.
    words = _NON_LETTERS.sub(' ', raw_text).lower().split()
    if remove_stopwords:
        stop_words = _english_stop_words()
        words = [w for w in words if w not in stop_words]
    return words


def review_to_sentences(review, tokenizer):
    """Split a review into sentences and tokenise each one.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize`` method splits text into
            sentences (here the NLTK punkt tokenizer).

    Returns:
        A list of word lists, one per non-empty sentence. Stop words are
        kept (``review_to_text`` is called with ``remove_stopwords=False``).
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    return [
        review_to_text(raw_sentence, False)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]


def _keyed_vectors(model):
    """Return the word-vector container for *model*.

    Models that expose their vectors as ``model.wv`` return that attribute;
    anything else (e.g. an object that is already the vector container) is
    returned unchanged.
    """
    return getattr(model, 'wv', model)


def _vocabulary(model):
    """Return the set of words known to *model*.

    Prefers the ``index_to_key`` attribute and falls back to the older
    ``index2word`` name when the former is absent.
    """
    vectors = _keyed_vectors(model)
    words = getattr(vectors, 'index_to_key', None)
    if words is None:
        words = vectors.index2word
    return set(words)


def makeFeatureVec(words, model, num_features, index2word_set=None):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: Iterable of tokens for one review.
        model: Trained word2vec model (or its keyed vectors).
        num_features: Dimensionality of the word vectors.
        index2word_set: Optional pre-built vocabulary set. Pass it when
            calling in a loop so the set is not rebuilt for every review.

    Returns:
        A float32 vector of length ``num_features``. If none of the words is
        in the vocabulary the zero vector is returned (the original divided
        by zero here and produced NaNs).
    """
    if index2word_set is None:
        index2word_set = _vocabulary(model)
    vectors = _keyed_vectors(model)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, vectors[word])
    # Guard against reviews with no in-vocabulary words.
    if nwords:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Convert every review into its averaged word-vector representation.

    Args:
        reviews: Sequence of token lists, one per review.
        model: Trained word2vec model (or its keyed vectors).
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)``.
    """
    # Build the vocabulary set once instead of once per review.
    vocabulary = _vocabulary(model)
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for row, review in enumerate(reviews):
        reviewFeatureVecs[row] = makeFeatureVec(
            review, model, num_features, vocabulary)
    return reviewFeatureVecs


# ---------------------------------------------------------------------------
# Load the labelled training data and the test data.
# ---------------------------------------------------------------------------
train = pd.read_csv(TRAIN_PATH, delimiter='\t')
test = pd.read_csv(TEST_PATH, delimiter='\t')

# Peek at the first few rows of each (only displays in an interactive session).
train.head()
#        id  sentiment                                             review
# 0  5814_8          1  With all this stuff going down at the moment w...
# 1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
# 2  7759_3          0  The film starts with a manager (Nicholas Bell)...
# 3  3630_4          0  It must be assumed that those who praised this...
# 4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
test.head()
#          id                                             review
# 0  12311_10  Naturally in a film who's main themes are of m...
# 1    8348_2  This movie is a disaster within a disaster fil...
# 2    5828_4  All in all, this is a movie for kids. We saw i...
# 3    7186_2  Afraid of the Dark left me with the impression...
# 4   12128_7  A very accurate depiction of small time mob li...

# Clean every review once; the token lists are reused later for the word2vec
# features, and the space-joined form feeds the bag-of-words vectorizers.
clean_train_reviews = [review_to_text(review, True) for review in train['review']]
clean_test_reviews = [review_to_text(review, True) for review in test['review']]

X_train = [' '.join(words) for words in clean_train_reviews]
X_test = [' '.join(words) for words in clean_test_reviews]
y_train = train['sentiment']

# ---------------------------------------------------------------------------
# Naive Bayes on bag-of-words features.
# ---------------------------------------------------------------------------
# Two pipelines that differ only in the text feature extractor.
# The step name must be 'count_vec' so that the 'count_vec__*' grid keys
# below resolve (the scraped source had a garbled step name here).
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')),
                      ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')),
                      ('mnb', MultinomialNB())])

# Hyper-parameter grids for the two pipelines.
params_count = {'count_vec__binary': [True, False],
                'count_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary': [True, False],
                'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
                'mnb__alpha': [0.1, 1.0, 10.0]}

# Parallel grid search with 4-fold cross-validation, CountVectorizer variant.
gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_count.fit(X_train, y_train)

print(gs_count.best_score_)
# Recorded by the author: 0.88216
print(gs_count.best_params_)
# Recorded by the author:
# {'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}
count_y_predict = gs_count.predict(X_test)

# Parallel grid search with 4-fold cross-validation, TfidfVectorizer variant.
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
gs_tfidf.fit(X_train, y_train)

print(gs_tfidf.best_score_)
# Recorded by the author: 0.88712
print(gs_tfidf.best_params_)
# Recorded by the author:
# {'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
tfidf_y_predict = gs_tfidf.predict(X_test)

# Format the predictions for submission.
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
submission_tfidf = pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})

submission_count.to_csv(SUBMISSION_COUNT_PATH, index=False)
submission_tfidf.to_csv(SUBMISSION_TFIDF_PATH, index=False)

# ---------------------------------------------------------------------------
# Train word vectors on the unlabelled reviews.
# ---------------------------------------------------------------------------
# quoting=3 (csv.QUOTE_NONE): quote characters are kept as part of the text.
unlabeled_train = pd.read_csv(UNLABELED_PATH, delimiter='\t', quoting=3)
unlabeled_train.head()
#           id                                             review
# 0   "9999_0"  "Watching Time Chasers, it obvious that it was...
# 1  "45057_0"  "I saw this film about 20 years ago and rememb...
# 2  "15561_0"  "Minor Spoilers<br /><br />In New York, Joan B...
# 3   "7161_0"  "I went to see this film with a great deal of ...
# 4  "43971_0"  "Yes, I agree with everyone on this site this ...

# NLTK sentence tokenizer used to split reviews into English sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Training corpus for word2vec: one token list per sentence.
corpora = []
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review, tokenizer)

# word2vec hyper-parameters.
num_features = 300
min_word_count = 20
num_workers = 4
context = 10
downsampling = 1e-3

print("Training model...")
# NOTE(review): `size=` is the keyword of the gensim release this script was
# written against; gensim 4 renamed it to `vector_size` -- confirm against
# the installed version.
model = word2vec.Word2Vec(corpora, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

model.init_sims(replace=True)

model_name = r"F:\TS\03_other_parts\kaggle\02_IMDB\02_data\300features_20minwords_10context"
model.save(model_name)

# Reload the trained word-vector model from disk.
model = Word2Vec.load(model_name)

# Inspect what the model learned (only displays in an interactive session).
_keyed_vectors(model).most_similar("man")
# Recorded by the author:
# [('woman', 0.6398072242736816),
#  ('lady', 0.593010663986206),
#  ('lad', 0.5564907789230347),
#  ('soldier', 0.5520418882369995),
#  ('chap', 0.5444163084030151),
#  ('person', 0.5429509878158569),
#  ('guy', 0.5271977186203003),
#  ('monk', 0.5111091136932373),
#  ('men', 0.5074273347854614),
#  ('boy', 0.5039346814155579)]

# ---------------------------------------------------------------------------
# Gradient boosting on averaged word vectors.
# ---------------------------------------------------------------------------
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

gbc = GradientBoostingClassifier()

params_gbc = {'n_estimators': [10, 100, 500],
              'learning_rate': [0.01, 0.1, 1.0],
              'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=4, n_jobs=-1, verbose=1)

gs.fit(trainDataVecs, y_train)

print(gs.best_score_)
print(gs.best_params_)

result = gs.predict(testDataVecs)
# Write the test results
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv(SUBMISSION_W2V_PATH, index=False, quoting=3)
相关文章推荐
- kaggle数据挖掘竞赛初步--Titanic<数据变换> 完整代码: https://github.com/cindycindyhi/kaggle-Titanic 特征工程系列: Titanic
- 算法竞赛入门经典,练习代码 ,3.4.9 上机练习
- 机器学习(二) 如何做到机器学习竞赛Kaggle排名前2%
- Kaggle 机器学习竞赛冠军及优胜者的源代码汇总
- 大数据竞赛平台——Kaggle 入门
- 算法竞赛 ► 源代码
- [Kaggle] 数据建模分析与竞赛平台介绍
- 基于机器学习多种方法的kaggle竞赛入门之手写数字的图像识别预测
- 如何使用Python在Kaggle竞赛中成为Top15
- Titanic: Machine Learning from Disaster(Kaggle 数据挖掘竞赛)
- 系统性训练,励志刷完挑战程序设计竞赛-代码整理68~103【初级篇】
- 算法竞赛入门经典 习题3-1 得分(Score, ACM/ICPC Seoul 2005, UVa1585)
- 机器学习竞赛(代码)
- 转:十步制胜 Kaggle 数据科学竞赛
- 使用C++模板和不使用C++模板两种情况中的执行代码大小近似估计
- 算法竞赛入门 刘汝佳 例题代码及练习题代码(二)
- 【Kaggle】竞赛记录 Who is the good/better man
- Kaggle竞赛之-titanic学习笔记
- 大数据竞赛平台——Kaggle 入门
- Brown-Mood估计、Theil估计 2、Siegel估计、线性分位回归的r语言代码