bagofwords tf-idf word2vec特征实践
2017-09-13 14:34
176 查看
1 bagofwords + bayes
# NOTE: in the original paste, "import pandas as pd" and the first sklearn
# import were fused onto one line (a SyntaxError); split back into separate
# statements and grouped third-party imports together.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
# Data preparation
def data_prepare():
    """Load the labeled dataset and split it into train/test sets.

    Reads "window regulator01.xlsx" (expects `categories` in {0, 1} and a
    `title` text column — TODO confirm against the spreadsheet), then takes
    the first 70% of each class as training data and the remaining 30% as
    test data, so both splits keep the original class proportions.

    Returns:
        (d_train, d_test): two pandas DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7  # train fraction per class
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    # Per-class head/tail split keeps the class ratio identical in both sets.
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test
def create_model(d_train, d_test):
    """Train a bag-of-words + MultinomialNB classifier and score the test set.

    Fits a CountVectorizer on `d_train.title`, trains MultinomialNB on the
    resulting counts, prints accuracy (at a 0.5 threshold) and AUC on the
    test set, and returns (y_true, predict) where `predict` is the
    predict_proba matrix (column 1 = probability of class 1).
    """
    vectorizer = CountVectorizer()  # bag-of-words feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    # Test features must use the vocabulary fitted on the training set only.
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes model
    NBmodle = MultinomialNB()
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict
# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy (0.5 threshold) and AUC.

    `predict` is a predict_proba matrix; only column 1 (probability of the
    positive class) is used. Blocks on plt.show().
    """
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()
# Script entry point: load/split the data, train and score the
# bag-of-words + Naive Bayes model, then plot its ROC curve.
d_train , d_test = data_prepare()
y_true, predict = create_model(d_train , d_test )
performance(y_true, predict)
2 tf-idf + bayes
# NOTE: in the original paste this entire script was collapsed onto a single
# line; reconstructed with conventional formatting. It is identical to the
# bag-of-words script except that CountVectorizer is replaced by
# TfidfVectorizer.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve


# Data preparation
def data_prepare():
    """Load the labeled dataset and split each class 70/30 into train/test."""
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Train tf-idf + MultinomialNB; print acc/AUC; return (y_true, proba)."""
    vectorizer = TfidfVectorizer()  # tf-idf feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes model
    NBmodle = MultinomialNB()
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict


# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy and AUC."""
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
3 tf-idf + bayes 参数优化
# NOTE: in the original paste, "import pandas as pd" and the first sklearn
# import were fused onto one line (a SyntaxError); split and regrouped
# (stdlib first, then third-party).
import csv
from itertools import product

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
# Data preparation
def data_prepare():
    """Load the labeled dataset and split it into train/test sets.

    Takes the first 70% of each class (`categories` 0 and 1) for training
    and the rest for testing, preserving the class ratio in both splits.

    Returns:
        (d_train, d_test): two pandas DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7  # train fraction per class
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test
def create_model(d_train, d_test):
    """Train tf-idf + MultinomialNB with default parameters and score the test set.

    Returns (y_true, predict) where `predict` is the predict_proba matrix
    (column 1 = probability of class 1).
    """
    vectorizer = TfidfVectorizer()  # tf-idf feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    # Vocabulary is fitted on the training titles only.
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes model
    NBmodle = MultinomialNB()
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict
# Parameterized model for the grid search
def create_model_param(d_train, d_test, max_features=None, min_df=1, nb_alpha=1.0):
    """Train tf-idf + MultinomialNB with the given hyper-parameters.

    Args:
        max_features: vocabulary cap passed to TfidfVectorizer (None = no cap).
        min_df: minimum document frequency passed to TfidfVectorizer.
        nb_alpha: additive smoothing parameter for MultinomialNB.

    Returns:
        dict with keys "max_feature", "min_df", "nb_alpha", "AUC" — the key
        names deliberately match the CSV fieldnames used by the grid-search
        driver, so do not rename them.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes model
    NBmodle = MultinomialNB(alpha=nb_alpha)
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    auc = roc_auc_score(y_true, predict[:, 1])
    print("AUC = %0.2f" % auc)
    return {
        "max_feature": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "AUC": auc,
    }
# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy (0.5 threshold) and AUC."""
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()
# Hyper-parameter grid: every combination is tried via itertools.product.
param_values = {
    "max_feature": [1000, 2000, 3000, None],
    "min_df": [1, 2, 3],
    "nb_alpha": [0.01, 0.1, 1.0],
}
result = []
d_train, d_test = data_prepare()
# BUG FIX: the original opened result.csv without ever closing it, so the
# last rows could be lost in the OS buffer; a `with` block guarantees the
# file is flushed and closed.
with open("result.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["AUC", "max_feature", "min_df", "nb_alpha"])
    writer.writeheader()
    # Relies on dict preserving insertion order so that the positional
    # arguments line up as (max_features, min_df, nb_alpha).
    for p in product(*param_values.values()):
        print(p)
        res = create_model_param(d_train, d_test, p[0], p[1], p[2])
        result.append(res)
        print(res)
        writer.writerow(res)
4 word2vec + random forest
# NOTE: in the original paste, "import pandas as pd" and the first sklearn
# import were fused onto one line (a SyntaxError); split and regrouped
# (stdlib first, then third-party).
import csv
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
# Data preparation
def data_prepare():
    """Load the labeled dataset and split each class 70/30 into train/test.

    Returns:
        (d_train, d_test): two pandas DataFrames with the original class
        ratio preserved in both splits.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7  # train fraction per class
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test
def data_process(doc):
    """Tokenize each document into a list of lowercase word tokens.

    Word2Vec expects a list of sentences where each sentence is itself a
    list of tokens; `doc` is an iterable of raw strings (e.g. a pandas
    Series of titles).
    """
    sentences = []
    for d in doc:
        d = d.lower()
        words = nltk.word_tokenize(d)
        sentences.append(words)
    return sentences
def featurize_w2v(model, sentences):
    """Vectorize documents as the mean of their word vectors.

    Args:
        model: a trained word-vector model exposing `vector_size` and
            `model[word]` lookup that raises KeyError for OOV words.
        sentences: list of token lists (output of data_process).

    Returns:
        numpy array of shape (len(sentences), model.vector_size). Each row
        is the sum of the sentence's in-vocabulary vectors divided by the
        TOTAL token count (OOV words dilute the average, as in the original).

    BUG FIX: an empty sentence previously caused ZeroDivisionError; it now
    yields a zero row.
    """
    f = np.zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        for w in s:
            try:
                vec = model[w]
            except KeyError:  # word not in the model's vocabulary
                continue
            f[i, :] = f[i, :] + vec
        if len(s) > 0:
            f[i, :] = f[i, :] / len(s)
    return f
def create_model(d_train, d_test):
    """Train Word2Vec features + a random forest classifier.

    Returns (y_true, predict): the test labels and the random forest's
    predict_proba matrix.

    NOTE(review): `size=` and `init_sims()` are the pre-4.0 gensim API
    (gensim 4+ renamed `size` to `vector_size`) — confirm the installed
    gensim version. The original source was garbled here ("worke"/"rs=2"
    split by scraping residue); reconstructed as `workers=2`.
    """
    sentences = data_process(d_train.title)
    model = Word2Vec(sentences, size=300, window=1, min_count=1,
                     sample=1e-3, workers=2)
    model.init_sims(replace=True)  # normalize vectors, discard raw weights
    feature_train = featurize_w2v(model, sentences)  # word2vec feature extraction
    RFCmodel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    RFCmodel.fit(feature_train, d_train.categories)
    test_sentences = data_process(d_test.title)
    feature_test = featurize_w2v(model, test_sentences)
    predict = RFCmodel.predict_proba(feature_test)
    return d_test.categories, predict
# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy (0.5 threshold) and AUC."""
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()
# Script entry point: load/split the data, train and score the
# word2vec + random forest model, then plot its ROC curve.
d_train , d_test = data_prepare()
y_true , predict = create_model(d_train , d_test)
performance(y_true , predict )
相关文章推荐
- 自然语言处理之Bag-of-words,TF-IDF模型
- Bag-of-words模型、TF-IDF模型
- 图像特征提取方法:Bag-of-words
- 【Spark Mllib】TF-IDF&Word2Vec——文本相似度
- 图像特征提取方法:Bag-of-words
- SIFT算法的应用--目标识别之Bag-of-words模型
- 基于BoW模型的图像分类 Image Classification with Bag of Visual Words
- sift+bag_of_words+LDA实现图片搜索(一)
- SIFT算法的应用--目标识别之Bag-of-words模型(转)
- TF-IDF原理详解以及python实践
- 短文本分析----基于python的TF-IDF特征词标签自动化提取
- Bag of Words/Bag of Features的Matlab源码发布
- 《Spark机器学习》笔记——Spark高级文本处理技术(NLP、特征哈希、TF-IDF、朴素贝叶斯多分类、Word2Vec)
- 文本分类特征提取之Word2Vec
- Bag Of Visual Words 三大步
- word2vec——高效word特征提取
- Bag-of-words模型入门介绍文章
- (6)文本挖掘(三)——文本特征TFIDF权重计算及文本向量空间VSM表示
- 目标识别:Bag-of-words表示图像
- 特征选择方法之TF-IDF、DF