
A test case for Chinese text feature extraction and classification with scikit-learn and jieba

2017-03-28 18:10
Notes:

Install jieba with this command: easy_install jieba. pip install jieba sometimes fails to install it correctly.

For Chinese text, pass the analyzer='word' parameter explicitly; a parameter like vocabulary=cv.vocabulary_ is there to align the number of features between the test set and the training set, as the sketch below illustrates.
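To make the alignment point concrete, here is a minimal sketch (the variable names and example strings are illustrative only, not part of the full test case below) showing that reusing the training vocabulary gives the test matrix the same number of columns. Note that CountVectorizer's default token_pattern, \b\w\w+\b, silently drops single-character tokens, so the example uses multi-character words only:

---------
from sklearn.feature_extraction.text import CountVectorizer

# Already space-segmented, e.g. by jieba; every token has 2+ characters,
# since the default token_pattern ignores single-character tokens.
train_docs = ["我们 喜欢 河南", "他们 讨厌 河南"]
test_docs = ["我们 喜欢 河南 胡辣汤"]

cv = CountVectorizer(analyzer='word')
X_train = cv.fit_transform(train_docs)

# A fresh vectorizer fit on test_docs alone would produce a different
# number of columns; reusing the training vocabulary keeps them aligned.
cv_test = CountVectorizer(analyzer='word', vocabulary=cv.vocabulary_)
X_test = cv_test.fit_transform(test_docs)

assert X_train.shape[1] == X_test.shape[1]
---------

Calling cv.transform(test_docs) on the already-fitted vectorizer achieves the same alignment without constructing a second vectorizer.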

Code:

---------

from time import time

import numpy as np
import jieba

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Raw documents; jieba segments them below. Labels (corpus_result): 1 = "love", 0 = "hate".
corpus = ["我爱河南。",
          "你恨河南。",
          "他总是爱河南。",
          "我有时候恨河南。"]

# Segment each document and re-join with spaces so that scikit-learn's
# word analyzer can split tokens on whitespace.
tokenized_corpus = []
for text in corpus:
    tokenized_corpus.append(" ".join(jieba.cut(text)))

test_corpus = ["我爱河南的胡辣汤。"]
tokenized_test_corpus = []
tokenized_test_corpus.append(" ".join(jieba.cut(test_corpus[0])))

corpus_result = [1, 0, 1, 0]

# The commented-out blocks below show the corresponding usage of FeatureHasher,
# HashingVectorizer, and CountVectorizer + TfidfTransformer.

#fh = feature_extraction.FeatureHasher(n_features=15, non_negative=True, input_type='string')
#X_train = fh.fit_transform(tokenized_corpus)
#X_test = fh.fit_transform(tokenized_test_corpus)

# (In scikit-learn >= 0.21 non_negative has been removed; see the note after the listing.)
#fh = feature_extraction.text.HashingVectorizer(n_features=15, non_negative=True, analyzer='word')
#X_train = fh.fit_transform(tokenized_corpus)
#X_test = fh.fit_transform(tokenized_test_corpus)

#cv = CountVectorizer(analyzer='word')
#transformer = TfidfTransformer()
#X_train = transformer.fit_transform(cv.fit_transform(tokenized_corpus))
## Reuse the training vocabulary so the test matrix gets the same columns.
#cv2 = CountVectorizer(vocabulary=cv.vocabulary_)
#transformer = TfidfTransformer()
#X_test = transformer.fit_transform(cv2.fit_transform(tokenized_test_corpus))
#word = cv.get_feature_names()
#weight = X_train.toarray()
#for i in range(len(weight)):
#    print("------- tf-idf weights of the words in document", i, "-------")
#    for j in range(len(word)):
#        print(word[j], weight[i][j])

tfidf = TfidfVectorizer(analyzer='word')
X_train = tfidf.fit_transform(tokenized_corpus)
# Reuse the training vocabulary so the train and test feature dimensions align.
# (Calling tfidf.transform(tokenized_test_corpus) on the fitted vectorizer would
# also align them, and would additionally reuse the training idf weights.)
tfidf = TfidfVectorizer(analyzer='word', vocabulary=tfidf.vocabulary_)
X_test = tfidf.fit_transform(tokenized_test_corpus)

y_train = corpus_result
y_test = [1]

def benchmark(clf_class, params, name):
    print("parameters:", params)
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - t0))

    if hasattr(clf, 'coef_'):
        print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))

    print("Predicting the outcomes of the testing set")
    t0 = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - t0))

    print("Classification report on test set for classifier:")
    print(clf)
    print()
    print(classification_report(y_test, pred))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

if __name__ == "__main__":
    print("Testbenching a linear classifier...")
    parameters = {
        'loss': 'hinge',
        'penalty': 'l2',
        'max_iter': 50,  # named n_iter in scikit-learn < 0.19
        'alpha': 0.00001,
        'fit_intercept': True,
    }
    benchmark(SGDClassifier, parameters, 'SGD')

---------
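As the commented-out block above hints, a hashing-based vectorizer sidesteps the vocabulary-sharing step entirely, because a token's column index is a pure function of its hash. A minimal sketch, assuming scikit-learn >= 0.21, where the non_negative parameter used in the commented code was replaced by alternate_sign:

---------
from sklearn.feature_extraction.text import HashingVectorizer

# HashingVectorizer is stateless: each token is hashed into one of
# n_features columns, so train and test matrices always have the same
# width without fitting or sharing a vocabulary.
hv = HashingVectorizer(n_features=16, analyzer='word', alternate_sign=False)

X_train = hv.transform(["我们 喜欢 河南", "他们 讨厌 河南"])
X_test = hv.transform(["我们 喜欢 河南 胡辣汤"])

assert X_train.shape[1] == X_test.shape[1] == 16
---------

The trade-off is that hashed features cannot be mapped back to words, so the tf-idf weight printout in the commented code has no direct equivalent.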

References:
http://blog.csdn.net/liuxuejiang158blog/article/details/31360765 (CountVectorizer + TfidfTransformer)
http://www.tuicool.com/articles/vYnIve (FeatureHasher)
http://blog.csdn.net/pat_datamine/article/details/43969631 (jieba; aligning train and test feature dimensions by reusing the training vocabulary)
http://blog.csdn.net/abcjennifer/article/details/23615947 (reusing the training vocabulary)