
Text Classification Study Notes (2) - Feature Extraction

2015-12-11 17:07
A senior labmate gave me a set of English text classification tasks, which makes for good practice.

The first problem is loading the text. The data format is: one folder per class, containing extension-less text files in which the words are separated by spaces.

The natural idea is to scan all the text files under each folder and read every word into a dictionary.

Based on the method provided in /article/1280082.html, a few modifications give the following code:

#coding=utf-8
import os
import nltk
from numpy import *
from operator import itemgetter
from collections import OrderedDict,Counter
from math import exp
from scipy import sparse,io
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

def load_files(directory,prefix=None,postfix=None):
    # collect all files under the directory (one sub-folder per class)
    files_list=[]
    classlen=[0 for i in range(11)]
    i = 0
    for root, sub_dirs, files in os.walk(directory):
        classlen[i] = len(files)
        i += 1
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root,special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root,special_file))
            else:
                files_list.append(os.path.join(root,special_file))
    # scan every file and build a per-document word-count dictionary
    articallist = [dict() for i in range(len(files_list))]
    filelen = [0 for l in range(len(files_list))]
    i = 0
    for eachfile in files_list:
        file_object = open(eachfile,'r')
        t = 0
        for line in file_object:
            for word in line.split():
                # skip pure numbers
                if not str(word).isdigit():
                    t += 1
                    # lower-case normalization
                    word = str(word).lower()
                    if articallist[i].has_key(word):
                        articallist[i][word] += 1
                    else:
                        articallist[i][word] = 1
        filelen[i] = t
        i += 1
        file_object.close()
    #print 'total number of files:',len(files_list)
    print len(articallist[1])
    return articallist,classlen,filelen

# load the English stop-word list
def load_stop_en(filename):
    word_list=[]
    file_object = open(filename,'r')
    for line in file_object:
        word_list.append(line.strip())
    file_object.close()
    return word_list

# remove stop words from a document's word-count dictionary
def delet_stopword_en(stop_en_set, en_dict):
    for key in stop_en_set:
        if en_dict.has_key(key):
            del en_dict[key]

# get a word's TF-IDF value over all documents
def get_TFIDF(articallist,filelen,word):
    num = len(articallist)
    TFindex = [0 for i in range(num)]
    IDFindex = 0
    for i,eachdict in enumerate(articallist):
        if eachdict.has_key(word):
            TFindex[i] = eachdict[word]/float(filelen[i])
            IDFindex += 1
    for i in range(len(TFindex)):
        if IDFindex != 0:
            TFindex[i] = TFindex[i] * exp(IDFindex/float(num))
    #print TFindex
    return TFindex

# merge dict2's counts into dict1, summing the values for shared keys
def updatex(dict1,dict2):
    for key in dict2.keys():
        if dict1.has_key(key):
            dict1[key] += dict2[key]
        else:
            dict1[key] = dict2[key]

def get_Mat(trainfilepath='training',testfilepath='test',stop_enname='en.txt',matfilename='SetMat.mat'):
    # load the training files
    articallist,classlen,filelen = load_files(trainfilepath)
    #print classlen
    # load the stop-word list
    stop_en_set = load_stop_en(stop_enname)
    # label set for the classifier
    #classlabel = [['acq'], ['corn'], ['crude'], ['earn'], ['grain'], ['interest'], ['money-fx'], ['ship'], ['trade'], ['wheat']]
    classlabel = [i+1 for i in range(10)]
    labeled_names = [0 for i in range(len(articallist))]

    classr = 0
    finaldict = {}
    for i in range(10):
        classl = classr
        classr += classlen[i+1]
        labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i+1])]
        tempdict = {}
        for eachdict in articallist[classl:classr]:
            delet_stopword_en(stop_en_set,eachdict)
            updatex(tempdict,eachdict)
        # keep the 3000 most frequent words of each class, then merge into the overall vocabulary
        tempdict = OrderedDict(sorted(tempdict.iteritems(), key=itemgetter(1), reverse=True))
        tempdict = dict(Counter(tempdict).most_common(3000))
        updatex(finaldict,tempdict)
    print 'vector:', len(finaldict)
    # compute the TF-IDF value of every keyword to get the document feature vectors
    vectormat = [get_TFIDF(articallist,filelen,each) for each in finaldict]
    # transpose: rows are documents, columns are features
    vectormat = array(vectormat).transpose()

    articallist1,classlen1,filelen1 = load_files(testfilepath)
    vectormat1 = [get_TFIDF(articallist1,filelen1,each) for each in finaldict]
    vectormat1 = array(vectormat1).transpose()
    #classifier = LogisticRegression()  # default parameters
    #classifier.fit(vectormat, labeled_names)  # fit on the training data
    #print classifier.predict([vectormat1[2]])
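The matfilename parameter and the scipy.io import above are never actually used; presumably the feature matrices were meant to be cached to a .mat file at the end of get_Mat, roughly like this (a sketch under that assumption; the key names are made up):

# at the end of get_Mat: cache the feature matrices and labels under matfilename
io.savemat(matfilename, {'train': vectormat, 'test': vectormat1, 'labels': labeled_names})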


The design here is weak, which makes the feature-word counting logic confusing.

Below is a simple classifier built with the sklearn package; the other classifier, Kmeans, is my own implementation of the k-means algorithm, and its accuracy is not great (an alternative using scikit-learn's KMeans is sketched after the code).

articallist1,classlen1,filelen1 = load_files('D:/Py/test')
vectormat1 = [get_TFIDF(articallist1, filelen1, each) for each in finaldict]
vectormat1 = array(vectormat1).transpose()
# train the classifier
classlabel = ['acq', 'corn', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship', 'trade', 'wheat']
# labels
labeled_names = ['' for i in range(len(articallist))]
classr = 0
for i in range(10):
    classl = classr
    classr += classlen[i+1]
    #print classl,classr,classlabel[i]
    labeled_names[classl:classr] = [classlabel[i] for k in range(classlen[i+1])]

classifier = LogisticRegression()  # default parameters
classifier.fit(vectormat, labeled_names)  # fit on the training data

print classifier.predict([vectormat1[800]])
#print classifier.predict_proba(vectormat1)

# K-means clustering prediction
#myCentroids, clustAssing = Kmeans.kMeans(vectormat,10,len(vectormat))
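Since my own Kmeans module is not shown here, one alternative worth noting is scikit-learn's built-in KMeans, which can cluster the same feature matrix (a sketch, not the implementation referenced above):

from sklearn.cluster import KMeans

km = KMeans(n_clusters=10)                 # 10 clusters, matching the 10 classes
cluster_ids = km.fit_predict(vectormat)    # cluster index for every training document
print cluster_ids[:20]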


Lessons from the fixes:

1. Python dictionaries provide an update method (e.g. dict.update(dict2) merges dict2's key/value pairs into dict). I used it to build the vocabulary needed for feature extraction, but there is a catch: when dict and dict2 share a key, update keeps only dict2's value, which corrupts the word-count statistics. I replaced it with the custom function updatex(); its time complexity still leaves room for improvement.
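A tiny illustration of the difference, with made-up counts, using the updatex defined above:

d1 = {'oil': 3, 'trade': 1}
d2 = {'oil': 2}
d1.update(d2)
print d1['oil']       # prints 2: the value from d2 overwrites 3 instead of summing to 5

merged = {'oil': 3, 'trade': 1}
updatex(merged, {'oil': 2})
print merged['oil']   # prints 5: updatex adds the counts for shared keys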

2. Term frequency: TF = (number of times the word appears in a document) / (total length of that document). The earlier code computed TFindex[i] = eachdict[word]/float(len(eachdict)), which mistakenly divides by the size of the document's vocabulary; the corrected version divides by the document length filelen[i].
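For reference, the conventional TF-IDF weight is usually written with a logarithmic IDF; get_TFIDF above uses an exp-based weight instead, so the helper below (hypothetical name and arguments) is only a comparison sketch:

from math import log

def tf_idf(count_in_doc, doc_len, num_docs, docs_with_word):
    # TF: occurrences of the word divided by the length of the document
    tf = count_in_doc / float(doc_len)
    # IDF: log of the total document count over the number of documents containing the word
    idf = log(num_docs / float(docs_with_word))
    return tf * idf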

3. The custom tokenization only splits on whitespace, so each token may still carry special characters such as ",.'\", which the stop-word removal step cannot handle either; this needs improvement.

nltk provides the sentence splitter sent_tokenize and the word tokenizer word_tokenize; WordNet can additionally be used to extract word stems, stripping tense and other inflected forms. In practice I found that WordNet fails when a word contains characters such as '\', so I added a custom function OnlyChar that keeps only letters, which unfortunately pushes the complexity up further... (a usage sketch follows the code below)

# sentence splitting and word tokenization
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    return sentences

# keep only letters in a token
def OnlyChar(s,oth=''):
    s2 = s.lower()
    letters = 'abcdefghijklmnopqrstuvwxyz'
    for c in s2:
        if not c in letters:
            s = s.replace(c,'')
    return s
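For completeness, a minimal sketch of chaining the nltk pieces mentioned above: sentence/word tokenization followed by WordNet lemmatization (assumes the 'punkt' and 'wordnet' corpora have been downloaded; the sample sentence is made up):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for sent in ie_preprocess("Oil prices were rising. Traders bought crude futures."):
    # strip stray characters first, then reduce each token to its WordNet lemma
    cleaned = [OnlyChar(w) for w in sent]
    print [lemmatizer.lemmatize(w) for w in cleaned if w]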


4. The classification results are still not ideal...