
Processing text data with pyspark + nltk

2016-06-03 17:20
Environment: Hadoop 2.6.0, Spark 1.6.0, Python 2.7. Download the code and data.
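NLTK also needs its data packages (tokenizer models, stopword list, POS tagger model) installed on every machine that runs the tasks; in local mode it is enough to download them once on this machine. A minimal sketch, assuming network access and a recent NLTK 3.x (older versions ship a different default tagger model):

import nltk
nltk.download('punkt')                          # models for sent_tokenize / word_tokenize
nltk.download('stopwords')                      # English stopword list used below
nltk.download('averaged_perceptron_tagger')     # model used by nltk.pos_tag
nltk.download('wordnet')                        # only needed if the WordNetLemmatizer line is enabled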

The code is as follows:

from pyspark import SparkContext
import nltk
from nltk.corpus import stopwords
from functools import reduce

sc = SparkContext('local', 'pyspark')
data = sc.textFile("hdfs:/user/hadoop/test.txt")

def filter_content(content):
    # Each input line is assumed to look like "meta%#%text"; the last "%#%"-separated field is the text to process.
    content_old = content
    content = content.split("%#%")[-1]
    sentences = nltk.sent_tokenize(content)  # sentence segmentation: sent_tokenize takes a passage of text and returns a list of sentences
    words = [word.lower() for sentence in sentences for word in nltk.word_tokenize(sentence)]  # tokenize into lowercase words
    words = [word for word in words if word not in stopwords.words('english')]  # remove stopwords
    words = [word for word in words if word not in ['/','^','-','+','<','>','{','}','*','//',',','.',':',';','?','(',')','[',']','&','!','*','@','|','#','$','%','"',"'","''",'""','`','``']]  # remove punctuation and empty tokens
    words = [var[0] for var in nltk.pos_tag(words) if var[1][0] in ['N','V']]  # POS-tag and keep only nouns and verbs
    words1 = [nltk.PorterStemmer().stem(word) for word in words]  # Porter stemming
    # words2 = [nltk.LancasterStemmer().stem(word) for word in words]  # Lancaster stemming
    # words3 = [nltk.WordNetLemmatizer().lemmatize(word) for word in words]  # WordNet lemmatization
    # words = set(words1 + words2 + words3)  # merge the three and deduplicate
    words = words1
    if words:
        # Rejoin the leading fields, then append the kept words separated by spaces.
        return reduce(lambda a, b: str(a) + "%#%" + str(b), content_old.split("%#%")[:-1]) + "%#%" + reduce(lambda a, b: "%s %s" % (a, b), words) + '\n'
    elif content_old.split("%#%")[1]:
        # No words survived the filtering but the text field was non-empty: keep only the leading fields.
        return reduce(lambda a, b: str(a) + "%#%" + str(b), content_old.split("%#%")[:-1]) + "%#%" + '\n'
    else:
        return ''

# filter_content("%#%I am a good boy.")
data = data.map(lambda line: filter_content(line))
data.saveAsTextFile("hdfs:/user/hadoop/test_result")
# Also collect the results to the driver and write them to a local file.
data_list = data.collect()
with open("/home/snow/zzwork/test_result.txt", "w") as fw:
    for var in data_list:
        fw.write(str(var))
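For a quick local sanity check before running on the cluster, filter_content can be called directly on one sample line (the commented-out call above does the same). A sketch with a made-up "doc_001" prefix; the exact stems depend on the tokenizer and tagger models installed:

print(filter_content("doc_001%#%The students are reading papers about machine learning."))
# Keeps the leading field and replaces the text field with space-separated stems of the
# remaining nouns/verbs, roughly "doc_001%#%student read paper machin learn\n"
# (actual tokens depend on the installed NLTK models).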
Tags: pyspark nltk