
Code Notes for *Natural Language Processing with Python* (Part 2): Chapter 2, Accessing Text Corpora and Lexical Resources

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : Peidong
# @Site    :
# @File    : eg2.py
# @Software: PyCharm
"""
Accessing Text Corpora and Lexical Resources
"""
# # Access the Gutenberg corpus
import nltk
print(nltk.corpus.gutenberg.fileids())
# # Count the words in Austen's Emma
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(len(emma))

from nltk.corpus import gutenberg
print(gutenberg.fileids())
emma = gutenberg.words('austen-emma.txt')
print(len(emma))
# # Average word length, average sentence length, and the average number of
# # times each word appears in the text (lexical diversity)
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # num_chars counts space characters, so the reported average word length
    # is effectively one more than the true average
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)
print(macbeth_sentences[1037])
longest_len = max(len(s) for s in macbeth_sentences)
# # Length of the longest sentence in the text
print(longest_len)
# # Print the longest sentence(s)
print([s for s in macbeth_sentences if len(s) == longest_len])

# Access the web text corpus (which includes the Pirates of the Caribbean script)
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

# Access instant messaging chat session posts
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])

# Access the Brown corpus
# List the categories of the Brown corpus
import nltk
from nltk.corpus import brown
print(brown.categories())
# # Words from the news category
print(brown.words(categories='news'))
# # Words from a specific file
print(brown.words(fileids=['cg22']))
# # Sentences from specific categories
print(brown.sents(categories=['news', 'editorial', 'reviews']))

# Count modal verbs in a particular genre
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])

# Conditional frequency distribution of modals across genres
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
# tabulate() prints the table itself and returns None, so it should not be wrapped in print()
cfd.tabulate(conditions=genres, samples=modals)
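# The tabulated counts can also be read off the distribution directly — a
# quick sanity check, added here and not part of the original notes:
print(cfd['news']['can'], cfd['romance']['could'])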

# Access the Reuters corpus
from nltk.corpus import reuters
print(reuters.fileids())
# # List the categories of the corpus
print(reuters.categories())
# # Documents belonging to one or more categories
print(reuters.fileids('barley'))
print(reuters.fileids(['barley', 'corn']))

# Access the inaugural address corpus
from nltk.corpus import inaugural
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])
# # How usage of "america" and "citizen" changes over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# The Universal Declaration of Human Rights corpus in various languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
print(languages)
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# Count words by genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
print(len(cfd))  # number of conditions, i.e. genres
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
# print(len(genre_word))
# print(genre_word[:4])
# print(genre_word[-4:])
cfd = nltk.ConditionalFreqDist(genre_word)
# print(cfd.conditions())
# print(cfd['news'])
# print(list(cfd['romance']))
print(cfd['romance']['could'])

# Plotting distribution graphs and tables
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
print(cfd)
cfd.plot()

# Generating random text with bigrams
import nltk
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print(list(nltk.bigrams(sent)))
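# For reference, the bigrams of this sentence are:
# [('In', 'the'), ('the', 'beginning'), ('beginning', 'God'), ('God', 'created'),
#  ('created', 'the'), ('the', 'heaven'), ('heaven', 'and'), ('and', 'the'),
#  ('the', 'earth'), ('earth', '.')]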

"""
Example 2-1. Generating random text: this program collects all the bigrams
from the text of Genesis, then builds a conditional frequency distribution
recording which words are most likely to follow a given word; for example,
the most likely word after "living" is "creature". The generate_model()
function uses this data, together with a seed word, to generate text.
"""
import nltk

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

text = nltk.corpus.genesis.words("english-kjv.txt")
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

print(list(cfd['living']))
generate_model(cfd, 'living')  # generate_model() prints as it goes and returns None
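
# Because cfdist[word].max() always picks the single most frequent successor,
# the generated text soon repeats itself ("... and the land of the land of ...").
# A variant of my own (not in the original notes) that instead samples the
# next word in proportion to its observed frequency:
import random

def generate_model_random(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        successors, counts = zip(*cfdist[word].items())
        word = random.choices(successors, weights=counts)[0]

generate_model_random(cfd, 'living')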

# The words wordlist corpus: find unusual or misspelled words in a text
import nltk

def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)
print(unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt')))
print(unusual_words(nltk.corpus.nps_chat.words()))

# The stopwords corpus
from nltk.corpus import stopwords
print(stopwords.words('english'))
# # Fraction of the words in a text that are not in the stopword list
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
print(content_fraction(nltk.corpus.reuters.words()))

# Word puzzle: find words of six or more letters that contain the obligatory
# letter and use each letter no more often than it appears in the puzzle
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
print([w for w in wordlist
       if len(w) >= 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters])
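# How the FreqDist comparison above works (a small illustration added here):
# a <= b is True only when every letter occurs no more often in a than in b.
print(nltk.FreqDist('eon') <= puzzle_letters)   # True: e, o, n are all available
print(nltk.FreqDist('seen') <= puzzle_letters)  # False: the puzzle has no 's'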

# The names corpus: find names that appear in both the male and female lists,
# i.e. gender-ambiguous names
import nltk
names = nltk.corpus.names
print(names.fileids())
male_names = names.words('male.txt')
female_names = names.words('female.txt')
print([w for w in male_names if w in female_names])
# Distribution of the last letter of names, by gender
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()

# The CMU Pronouncing Dictionary
import nltk
entries = nltk.corpus.cmudict.entries()
print(len(entries))
for entry in entries[39943:39951]:
    print(entry)

# Words whose pronunciation has three phones, beginning with P and ending with T
for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2, end=' ')
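
# For single-word lookups it is more convenient to load the dictionary form.
# cmudict.dict() is part of the standard NLTK API; 'fire' is just an example:
prondict = nltk.corpus.cmudict.dict()
print(prondict['fire'])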

# Toolbox lexicons
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic'))

# The WordNet lexical database
from nltk.corpus import wordnet as wn
# Synsets of a given word
print(wn.synsets('motorcar'))
print(wn.synset('car.n.01').lemma_names())  # lemma_names() is a method in NLTK 3
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar[26])
# lemmas() and name() are also methods in NLTK 3; calling them avoids the
# "'method' object is not iterable" error
print(sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()))
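
# Going up the hierarchy instead: hypernyms and hypernym paths, both part of
# the standard WordNet API (added here as a small complement to hyponyms):
print(motorcar.hypernyms())
paths = motorcar.hypernym_paths()
print(len(paths))
print([synset.name() for synset in paths[0]])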

# Verb entailments
print(wn.synset('walk.v.01').entailments())
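
# Antonyms are defined on lemmas rather than synsets (standard WordNet API;
# 'supply' vs. 'demand' is a common example):
print(wn.lemma('supply.n.02.supply').antonyms())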