pyspark+nltk处理文本数据
2016-06-03 17:20
302 查看
环境条件:Hadoop 2.6.0、Spark 1.6.0、Python 2.7;示例代码和测试数据可另行下载(原文链接已丢失)。
代码如下:
# Imports grouped at the top (PEP 8): stdlib, then third-party.
# The original interleaved imports with executing Spark statements.
from functools import reduce

import nltk
from nltk.corpus import stopwords
from pyspark import SparkContext

# Local-mode Spark context; load the raw text corpus from HDFS.
sc = SparkContext('local', 'pyspark')
data = sc.textFile("hdfs:/user/hadoop/test.txt")
def filter_content(content):
    """Tokenize, clean, POS-filter and stem the text part of one record.

    A record has the shape ``prefix%#%...%#%text``; only the last segment
    is processed. Returns ``prefix%#%<stemmed words joined by spaces>\n``
    when any word survives the filters, ``prefix%#%\n`` when the text was
    non-empty but nothing survived, and ``''`` otherwise.

    Fixes over the original:
    - stop-word list fetched once as a set (was re-fetched per word),
    - one PorterStemmer instance (was one per word),
    - str.join instead of reduce (reduce raised TypeError on empty lists,
      e.g. a line without any "%#%" delimiter),
    - length guard before indexing split parts (was an IndexError).
    """
    parts = content.split("%#%")
    text = parts[-1]

    # Sentence-split, then word-split and lower-case every token.
    sentences = nltk.sent_tokenize(text)
    words = [word.lower()
             for sentence in sentences
             for word in nltk.word_tokenize(sentence)]

    # Hoisted out of the loop: set membership is O(1) and the NLTK
    # stop-word list is only downloaded/parsed once.
    stop_words = set(stopwords.words('english'))
    punctuation = {'/', '^', '-', '+', '<', '>', '{', '}', '*', '//',
                   ',', '.', ':', ';', '?', '(', ')', '[', ']', '&',
                   '!', '@', '|', '#', '$', '%', '"', "'", "''", '""',
                   '`', '``'}
    words = [w for w in words if w not in stop_words and w not in punctuation]

    # Keep only nouns and verbs (POS tags starting with N or V).
    words = [token for token, tag in nltk.pos_tag(words) if tag[0] in ('N', 'V')]

    # Porter stemming with a single stemmer instance.
    stemmer = nltk.PorterStemmer()
    words = [stemmer.stem(w) for w in words]

    prefix = "%#%".join(parts[:-1])  # '' when the line had no delimiter
    if words:
        return prefix + "%#%" + " ".join(words) + '\n'
    if len(parts) > 1 and parts[1]:
        # Text segment existed but no word survived filtering.
        return prefix + "%#%" + '\n'
    return ''
# Clean every line, persist the result to HDFS, and also mirror it to a
# local file for inspection.
data = data.map(filter_content)  # pass the function directly; the lambda wrapper was redundant
data.saveAsTextFile("hdfs:/user/hadoop/test_result")
data_list = data.collect()
with open("/home/snow/zzwork/test_result.txt", "w") as fw:
    # One buffered call instead of a per-record write loop.
    fw.writelines(str(var) for var in data_list)
代码如下:
# Imports grouped at the top (PEP 8): stdlib, then third-party.
# The original interleaved imports with executing Spark statements.
from functools import reduce

import nltk
from nltk.corpus import stopwords
from pyspark import SparkContext

# Local-mode Spark context; load the raw text corpus from HDFS.
sc = SparkContext('local', 'pyspark')
data = sc.textFile("hdfs:/user/hadoop/test.txt")
def filter_content(content):
    """Tokenize, clean, POS-filter and stem the text part of one record.

    A record has the shape ``prefix%#%...%#%text``; only the last segment
    is processed. Returns ``prefix%#%<stemmed words joined by spaces>\n``
    when any word survives the filters, ``prefix%#%\n`` when the text was
    non-empty but nothing survived, and ``''`` otherwise.

    Fixes over the original:
    - stop-word list fetched once as a set (was re-fetched per word),
    - one PorterStemmer instance (was one per word),
    - str.join instead of reduce (reduce raised TypeError on empty lists,
      e.g. a line without any "%#%" delimiter),
    - length guard before indexing split parts (was an IndexError).
    """
    parts = content.split("%#%")
    text = parts[-1]

    # Sentence-split, then word-split and lower-case every token.
    sentences = nltk.sent_tokenize(text)
    words = [word.lower()
             for sentence in sentences
             for word in nltk.word_tokenize(sentence)]

    # Hoisted out of the loop: set membership is O(1) and the NLTK
    # stop-word list is only downloaded/parsed once.
    stop_words = set(stopwords.words('english'))
    punctuation = {'/', '^', '-', '+', '<', '>', '{', '}', '*', '//',
                   ',', '.', ':', ';', '?', '(', ')', '[', ']', '&',
                   '!', '@', '|', '#', '$', '%', '"', "'", "''", '""',
                   '`', '``'}
    words = [w for w in words if w not in stop_words and w not in punctuation]

    # Keep only nouns and verbs (POS tags starting with N or V).
    words = [token for token, tag in nltk.pos_tag(words) if tag[0] in ('N', 'V')]

    # Porter stemming with a single stemmer instance.
    stemmer = nltk.PorterStemmer()
    words = [stemmer.stem(w) for w in words]

    prefix = "%#%".join(parts[:-1])  # '' when the line had no delimiter
    if words:
        return prefix + "%#%" + " ".join(words) + '\n'
    if len(parts) > 1 and parts[1]:
        # Text segment existed but no word survived filtering.
        return prefix + "%#%" + '\n'
    return ''
# Clean every line, persist the result to HDFS, and also mirror it to a
# local file for inspection.
data = data.map(filter_content)  # pass the function directly; the lambda wrapper was redundant
data.saveAsTextFile("hdfs:/user/hadoop/test_result")
data_list = data.collect()
with open("/home/snow/zzwork/test_result.txt", "w") as fw:
    # One buffered call instead of a per-record write loop.
    fw.writelines(str(var) for var in data_list)
相关文章推荐
- 开始spark之旅
- spark的几点备忘
- python几个工具包的安装
- nltk在python中的安装,以及nltk的data库
- python自然语言处理-学习笔记(一)
- Python自然语言处理(二)--NLTK调用Stanford_NLP_Tools完成NLP任务
- NLTK vs Sklearn vs Gensim
- NLTK 学习笔记(1)
- nltk 3.0 的parse
- NLTK 学习笔记(4)
- NLTK 学习笔记(5)
- NLTK学习笔记(6)
- NLTK学习笔记(7)- Extracting information from text
- 词语相似度计算:1、安装NLTK和下载WordNet语料库;WordNet的使用
- Python 文本分类器
- Python 调用 Stanford Parser 两种方法
- Python java.lang.NoClassDefFoundError: org/slf4j/LoggerFactory解决办法
- jpython LookupError: unknown encoding 'ms936' 问题解决
- stanford parser 使用说明
- nltk.download()下载失败的解决办法