
Bag-of-words, TF-IDF, and word2vec features in practice

2017-09-13 14:34

1 Bag-of-words + Naive Bayes

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Data preparation
def data_prepare():
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("class-0 samples = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("class-1 samples = %d" % len(Yesdf))
    # Take the first 70% of each class for training and the rest for testing
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("training fraction = %.1f" % split)
    print("training samples = %d" % len(d_train))
    print("test samples = %d" % len(d_test))
    return d_train, d_test

def create_model(d_train, d_test):
    vectorizer = CountVectorizer()  # bag-of-words feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("training feature matrix shape: " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # inspect some feature names
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes classifier
    NBmodle = MultinomialNB()
    print("training...")
    NBmodle.fit(features, d_train.categories)
    print("testing...")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict

# Model evaluation
def performance(y_true, predict):
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc: %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC: %0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()

d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
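Before moving on, it may help to see what CountVectorizer actually builds. Below is a minimal sketch on a made-up three-title corpus (the titles are hypothetical, not taken from the dataset above); each row of the resulting matrix is the bag-of-words vector that MultinomialNB consumes.

from sklearn.feature_extraction.text import CountVectorizer

titles = ["front left window regulator",
          "rear window regulator with motor",
          "front brake pad set"]

vec = CountVectorizer()
X = vec.fit_transform(titles)   # sparse document-term matrix of raw counts
print(vec.get_feature_names())  # vocabulary (get_feature_names_out() in newer scikit-learn)
print(X.toarray())              # one row per title, one column per vocabulary term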

2 TF-IDF + Naive Bayes

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Data preparation
def data_prepare():
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("class-0 samples = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("class-1 samples = %d" % len(Yesdf))
    # First 70% of each class for training, the rest for testing
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("training fraction = %.1f" % split)
    print("training samples = %d" % len(d_train))
    print("test samples = %d" % len(d_test))
    return d_train, d_test

def create_model(d_train, d_test):
    vectorizer = TfidfVectorizer()  # TF-IDF feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("training feature matrix shape: " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # inspect some feature names
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes classifier
    NBmodle = MultinomialNB()
    print("training...")
    NBmodle.fit(features, d_train.categories)
    print("testing...")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict

# Model evaluation
def performance(y_true, predict):
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc: %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC: %0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()

d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
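The only change from section 1 is the vectorizer. As a quick illustration of what TF-IDF adds, the sketch below (again on a hypothetical toy corpus) prints the learned idf weights: terms that occur in many titles receive a lower idf than rare ones, so their contribution shrinks relative to raw counts, and with scikit-learn's defaults each row is also L2-normalized.

from sklearn.feature_extraction.text import TfidfVectorizer

titles = ["front left window regulator",
          "rear window regulator with motor",
          "front brake pad set"]

vec = TfidfVectorizer()        # defaults: smoothed idf, L2 row normalization
X = vec.fit_transform(titles)
# Terms appearing in two of the three titles ("front", "window", "regulator")
# get lower idf than terms appearing in only one ("motor", "brake").
print(dict(zip(vec.get_feature_names(), vec.idf_.round(2))))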


3 TF-IDF + Naive Bayes with parameter tuning

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from itertools import product
import csv

# Data preparation
def data_prepare():
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("class-0 samples = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("class-1 samples = %d" % len(Yesdf))
    # First 70% of each class for training, the rest for testing
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("training fraction = %.1f" % split)
    print("training samples = %d" % len(d_train))
    print("test samples = %d" % len(d_test))
    return d_train, d_test

# Baseline model with default parameters (kept for reference; not called below)
def create_model(d_train, d_test):
    vectorizer = TfidfVectorizer()  # TF-IDF feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("training feature matrix shape: " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # inspect some feature names
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes classifier
    NBmodle = MultinomialNB()
    print("training...")
    NBmodle.fit(features, d_train.categories)
    print("testing...")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict

# Model with tunable parameters
def create_model_param(d_train, d_test, max_features=None, min_df=1, nb_alpha=1.0):
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)
    features = vectorizer.fit_transform(d_train.title)
    print("training feature matrix shape: " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # inspect some feature names
    test_features = vectorizer.transform(d_test.title)
    # Naive Bayes classifier
    NBmodle = MultinomialNB(alpha=nb_alpha)
    print("training...")
    NBmodle.fit(features, d_train.categories)
    print("testing...")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    auc = roc_auc_score(y_true, predict[:, 1])
    print("AUC = %0.2f" % auc)
    return {
        "max_feature": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "AUC": auc,
    }

# Model evaluation
def performance(y_true, predict):
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc: %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC: %0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()

# Parameter grid (relies on dict insertion order, i.e. Python 3.7+)
param_values = {
    "max_feature": [1000, 2000, 3000, None],
    "min_df": [1, 2, 3],
    "nb_alpha": [0.01, 0.1, 1.0],
}
result = []
file = open("result.csv", "w", newline="")
writer = csv.DictWriter(file, fieldnames=["AUC", "max_feature", "min_df", "nb_alpha"])
writer.writeheader()
d_train, d_test = data_prepare()
for p in product(*param_values.values()):
    print(p)
    res = create_model_param(d_train, d_test, p[0], p[1], p[2])
    result.append(res)
    print(res)
    writer.writerow(res)
file.close()  # flush and close the results file
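The manual itertools.product loop above works, but the same sweep can also be expressed with scikit-learn's Pipeline and GridSearchCV. Below is a sketch using the same parameter grid and reusing d_train from data_prepare(); note that it scores by cross-validated AUC on the training split rather than on the fixed test split used above.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())])
param_grid = {
    "tfidf__max_features": [1000, 2000, 3000, None],
    "tfidf__min_df": [1, 2, 3],
    "nb__alpha": [0.01, 0.1, 1.0],
}
search = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=5)
search.fit(d_train.title, d_train.categories)
print(search.best_params_)
print("cross-validated AUC = %0.2f" % search.best_score_)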



4 word2vec + random forest

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
import nltk
import numpy as np

# Data preparation
def data_prepare():
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("class-0 samples = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("class-1 samples = %d" % len(Yesdf))
    # First 70% of each class for training, the rest for testing
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("training fraction = %.1f" % split)
    print("training samples = %d" % len(d_train))
    print("test samples = %d" % len(d_test))
    return d_train, d_test

def data_process(doc):
    # Word2Vec expects a list of sentences, where each sentence is itself a list of tokens
    sentences = []
    for d in doc:
        d = d.lower()
        words = nltk.word_tokenize(d)
        sentences.append(words)
    return sentences

def featurize_w2v(model, sentences):
    # Document vectorization: each document's feature is the average of its word vectors
    f = np.zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        for w in s:
            try:
                vec = model[w]  # use model.wv[w] in newer gensim releases
            except KeyError:
                continue
            f[i, :] = f[i, :] + vec
        f[i, :] = f[i, :] / len(s)  # assumes every title tokenizes to at least one word
    return f

def create_model(d_train, d_test):
    sentences = data_process(d_train.title)
    # The size= argument is called vector_size= in gensim >= 4
    model = Word2Vec(sentences, size=300, window=1, min_count=1, sample=1e-3, workers=2)
    model.init_sims(replace=True)  # L2-normalize vectors in place (deprecated in gensim >= 4)
    feature_train = featurize_w2v(model, sentences)  # word2vec feature extraction
    RFCmodel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    RFCmodel.fit(feature_train, d_train.categories)

    test_sentences = data_process(d_test.title)
    feature_test = featurize_w2v(model, test_sentences)
    predict = RFCmodel.predict_proba(feature_test)
    return d_test.categories, predict

# Model evaluation
def performance(y_true, predict):
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc: %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC: %0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()

d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
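Two caveats on the word2vec script, illustrated by the small sanity-check sketch below. It assumes gensim >= 4, where the constructor argument is vector_size rather than size and vectors are accessed through model.wv; it also guards against token lists that are empty or entirely out of vocabulary, which the featurize_w2v loop above does not handle explicitly. The toy corpus is made up purely for illustration.

from gensim.models import Word2Vec
import numpy as np

toy = [["left", "front", "window", "regulator"],
       ["rear", "door", "window", "regulator"],
       ["brake", "pad", "set"]]

# A tiny model purely for illustration; real corpora need far more data.
m = Word2Vec(toy, vector_size=50, window=2, min_count=1, workers=1, seed=1)

def average_vector(model, words):
    # Average the vectors of in-vocabulary words; fall back to zeros if none are known.
    vecs = [model.wv[w] for w in words if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

print(average_vector(m, ["window", "regulator"]).shape)  # (50,)
print(average_vector(m, ["unknown", "tokens"]))          # all zeros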