NLTK04 "Natural Language Processing with Python" code03: Processing Raw Text
2017-08-29 16:44
Processing Raw Text
# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# "Natural Language Processing with Python", Chapter 3: Processing Raw Text
# pnlp03.py

from __future__ import division
import nltk, re, pprint

# 3.1 Accessing Text from the Web and from Disk

# Electronic books
# urllib.request is part of the standard library; no extra install is needed
from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
raw = urlopen(url).read()
print(type(raw))     # <class 'bytes'>
#pprint.pprint(raw)
print(len(raw))      # 1201733
raw = raw.decode('utf-8')
tokens = nltk.word_tokenize(raw)
print(type(tokens))  # <class 'list'>
print(tokens[:10])   # ['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']
text = nltk.Text(tokens)
print(type(text))    # <class 'nltk.text.Text'>
print(text[1020:1040]) # ['AND', 'PUNISHMENT', 'PART', 'I', 'CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', ...
text.collocations()

n = raw.find("PART I")
print(n)  # 5336
n = raw.rfind("End of Project Gutenberg's Crime")
print(n)  # -1
raw = raw[5303:1157681]
n = raw.find("PART I")
print(n)  # 33

# Dealing with HTML
from urllib.request import urlopen
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
print(html[:5])  # b'<!doc'
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
import bs4
import lxml
#raw = nltk.clean_html(html)  # removed in NLTK 3; use BeautifulSoup instead
raw = bs4.BeautifulSoup(html, "lxml")
raw = raw.get_text()
tokens = nltk.word_tokenize(raw)
print(tokens)  # ['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", ...
tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')
# Displaying 5 of 5 matches:hey say too few people now carry the gene for blondes to last beyond the next ...

# Processing search engine results

# Processing RSS feeds
# https://pypi.python.org/pypi/feedparser/
import feedparser
import bs4
import lxml
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])  # Language Log
print(len(llog.entries))      # 13
post = llog.entries[2]
print(post.title)             # Asses and asterisks
content = post.content[0].value
print(content[:70])  # <p>When <em>The Sun</em>, a famously prurient UK tabloid newspaper, ch
txt = bs4.BeautifulSoup(content, "lxml").get_text()
t0 = nltk.word_tokenize(txt)
print(t0)  # ['When', 'The', 'Sun', ',', 'a', 'famously', 'prurient', 'UK', 'tabloid', ...
t1 = nltk.word_tokenize(bs4.BeautifulSoup(llog.entries[2].content[0].value, "lxml").get_text())
print(t1)  # ['When', 'The', 'Sun', ',', 'a', 'famously', 'prurient', 'UK', ...

# Reading local files
# document.txt contains the single line: test document.txt
f = open('document.txt')
raw = f.read()
print(raw)  # test document.txt
f.close()
f = open('document.txt')
for line in f:
    print(line.strip())
f.close()
path = nltk.data.find('corpora/abc/rural.txt')
raw = open(path, 'rU').read()  # 'rU' is deprecated in Python 3; plain open(path) suffices
print(raw[:20])  # PM denies knowledge

# Extracting text from PDF, MS Word and other binary formats
# pypdf pywin32 (a hedged pypdf sketch follows at the end of this section)

# Capturing user input
s = input("Enter some text: ")
print(type(s), s)

# The NLP pipeline
raw = open('document.txt').read()
print(type(raw))    # <class 'str'>
tokens = nltk.word_tokenize(raw)
print(type(tokens)) # <class 'list'>
words = [w.lower() for w in tokens]
print(type(words), words)  # <class 'list'> ['test', 'document.txt']
vocab = sorted(set(words))
print(type(vocab), vocab)  # <class 'list'> ['document.txt', 'test']
vocab.append('blog')
# raw.append('blog')  # AttributeError: 'str' object has no attribute 'append'
# A string and a list cannot be concatenated:
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
# s = query + beatles  # TypeError: must be str, not list
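# The PDF extraction above is only mentioned as a pointer in the book. Below is
# a minimal sketch using the modern pypdf package (pip3 install pypdf), the
# successor of the pypdf/PyPDF2 line; the file name 'sample.pdf' is an
# assumption for illustration and not part of the original listing.
from pypdf import PdfReader

reader = PdfReader('sample.pdf')  # open a local PDF (hypothetical file)
pdf_text = '\n'.join(page.extract_text() or '' for page in reader.pages)
pdf_tokens = nltk.word_tokenize(pdf_text)  # feed the raw text into the usual pipeline
print(pdf_tokens[:10])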
# 3.2 Strings: Text Processing at the Lowest Level

# Basic operations with strings
monty = 'Monty Python'
print(monty)   # Monty Python
circus = "Monty Python's Flying Circus"
print(circus)  # Monty Python's Flying Circus
circus = 'Monty Python\'s Flying Circus'
print(circus)  # Monty Python's Flying Circus
## circus = 'Monty Python's Flying Circus'  # SyntaxError: unescaped quote
couplet = "Shall I compare thee to a Summer's day?"\
          "Thou art more lovely and more temperate:"
print(couplet)
# Shall I compare thee to a Summer's day?Thou art more lovely and more temperate:
couplet = ("Rough winds do shake the darling buds of May, "
           "And Summer's lease hath all too short a date:")
print(couplet)
# Rough winds do shake the darling buds of May, And Summer's lease hath all too short a date:
couplet = """Shall I compare thee to a Summer's day?
Thou art more lovely and more temperate:"""
print(couplet)
#Shall I compare thee to a Summer's day?
#Thou art more lovely and more temperate:
print('very' + 'very' + 'very')  # veryveryvery
print('very'*3)                  # veryveryvery
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]
for line in b:
    print(line)
# s = 'very' - 'y'  # TypeError: unsupported operand type(s) for -: 'str' and 'str'
# s = 'very' / 2    # TypeError: unsupported operand type(s) for /: 'str' and 'int'

# Printing strings
monty = 'Monty Python'
print(monty)  # Monty Python
grail = 'Holy Grail'
print(monty + grail)            # Monty PythonHoly Grail
print(monty, "and the", grail)  # Monty Python and the Holy Grail

# Accessing individual characters
print(monty[0])   # M
print(monty[3])   # t
print(monty[5])   # ' '
#print(monty[20]) # IndexError: string index out of range
print(monty[-1])  # n
print(monty[5])   # ' '
print(monty[-7])  # ' '
sent = 'colorless green ideas sleep furiously'
for char in sent:
    print(char, end='')  # no trailing newline
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(list(fdist.keys()))
# ['m', 'o', 'b', 'y', 'd', 'i', 'c', 'k', 'h', 'e', 'r', 'a', 'n', 'l', 'v', 't', 'g', 's', 'u', 'p', 'w', 'x', 'q', 'f', 'j', 'z']

# Accessing substrings
print(monty[6:10])   # Pyth
print(monty[-12:-7]) # Monty
print(monty[:5])     # Monty
print(monty[6:])     # Python
phrase = 'And now for something completely different'
if 'thing' in phrase:
    print('found "thing"')  # found "thing"

# More operations on strings
# s.find(t)       index of the first occurrence of t in s (-1 if not found)
# s.rfind(t)      index of the last occurrence of t in s (-1 if not found)
# s.index(t)      like s.find(t), but raises ValueError if not found
# s.rindex(t)     like s.rfind(t), but raises ValueError if not found
# s.join(text)    concatenate the strings in text, using s as the glue
# s.split(t)      split s into a list wherever t is found (whitespace by default)
# s.splitlines()  split s into a list of strings, one per line
# s.lower()       a lowercased version of s
# s.upper()       an uppercased version of s
# s.title()       a titlecased version of s
# s.strip()       a copy of s without leading or trailing whitespace
# s.replace(t, u) replace instances of t with u inside s
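# A quick demonstration of the methods tabulated above (not in the original
# listing; outputs follow from the documented stdlib behavior):
s = '  Monty Python  '
print(s.find('Python'))               # 8
print(s.strip())                      # Monty Python
print(s.strip().lower())              # monty python
print(s.strip().split(' '))           # ['Monty', 'Python']
print('-'.join(['Monty', 'Python']))  # Monty-Python
print(s.replace('Python', 'Hall').strip())  # Monty Hall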
I don't" #print(beatles + 'Brian') # TypeError: can only concatenate list (not "str") to list print(beatles + ['Brian']) # beatles[0] = "John Lennon" del beatles[-1] print(beatles) # # 字符串是不可变的 beatles[0] = 'F' # # 3.3 使用unicode进行文字处理 # 什么是unicode # 从文件中提取自己编码文本 path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt') import codecs f = codecs.open(path, encoding='latin2') for line in f: line = line.strip() print(line.encode('unicode_escape')) print(ord('a')) # 97 a = u'\u0061' print(a) # nacute = u'\u0144' print(nacute) # nacute_utf = nacute.encode('utf8') print(repr(nacute_utf)) # import unicodedata lines = codecs.open(path, encoding = 'latin2').readlines() line = lines[2] print(line.encode('unicode_escape')) for c in line: if ord(c) > 127: print('%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c))) print(line.find(u'zosta\u0142y')) line = line.lower() print(line.encode('unicode_escape')) import re m = re.search(u'\u015b\w*', line) print(m.group()) nltk.word_tokenize(line) # 在Python中使用本地编码 # '#-*-coding:<utf-8>-*-' # 3.4 使用正则表达式检测词组搭配 import re wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()] print(wordlist) # 使用基本的元字符 # <<ed$>>表示以ed结尾的词汇 res = [w for w in wordlist if re.search('ed$', w)] print(res) # 通配符"."匹配任意单个字符 res = [w for w in wordlist if re.search('^..j..t..$', w)] print(res) # 范围与闭包 res = [w for w in wordlist if re.search('^[hgi][mno][jlk][def]$', w)] print(res) chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words())) res = [w for w in chat_words if re.search('^m+i+n+e+$', w)] print(res) res = [w for w in chat_words if re.search('^[ha]+$', w)] print(res) wsj = sorted(set(nltk.corpus.treebank.words())) res = [w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)] print(res) res = [w for w in wsj if re.search('^[A-Z]+$', w)] print(res) res = [w for w in wsj if re.search('^[0-9]{4}$', w)] print(res) res = [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)] print(res) res = [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)] print(res) res = [w for w in wsj if re.search('(ed|ing)$', w)] print(res) # . 通配符,匹配所有字符 # ^abc 匹配以abc开始的字符串 # abc$ 匹配以abc结尾的字符串 # [abc] 匹配字符合集 # [A-Z0-9] 匹配字符范围 # ed|ing|s 匹配指定字符串(析取) # * 前面的项目另个或多个,如a*、[a-z]*(也叫Kleene闭包) # + 前面的项目1个或多个,如a+、[a-z]+ # ? 前面的项目0个或1个(即:可选),如:a?、[z-a]? 
# 3.4 Regular Expressions for Detecting Word Patterns
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
print(wordlist)

# Using basic metacharacters
# «ed$» matches words ending in ed
res = [w for w in wordlist if re.search('ed$', w)]
print(res)
# the wildcard "." matches any single character
res = [w for w in wordlist if re.search('^..j..t..$', w)]
print(res)

# Ranges and closures
res = [w for w in wordlist if re.search('^[hgi][mno][jlk][def]$', w)]
print(res)
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
res = [w for w in chat_words if re.search('^m+i+n+e+$', w)]
print(res)
res = [w for w in chat_words if re.search('^[ha]+$', w)]
print(res)
wsj = sorted(set(nltk.corpus.treebank.words()))
res = [w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[A-Z]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]{4}$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
print(res)
res = [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
print(res)
res = [w for w in wsj if re.search('(ed|ing)$', w)]
print(res)

# Basic regular expression metacharacters:
# .         wildcard, matches any character
# ^abc      matches the pattern abc at the start of a string
# abc$      matches the pattern abc at the end of a string
# [abc]     matches one of a set of characters
# [A-Z0-9]  matches one of a range of characters
# ed|ing|s  matches one of the specified strings (disjunction)
# *         zero or more of the previous item, e.g. a*, [a-z]* (Kleene closure)
# +         one or more of the previous item, e.g. a+, [a-z]+
# ?         zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
# {n}       exactly n repeats, where n is a non-negative integer
# {n,}      at least n repeats
# {,n}      no more than n repeats
# {m,n}     at least m and no more than n repeats
# a(b|c)+   parentheses indicate the scope of the operators

# 3.5 Useful Applications of Regular Expressions

# Extracting word pieces
word = 'supercalifragilisticexpialidocious'
res = re.findall(r'[aeiou]', word)
print(res)
print(len(res))
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))
fd.items()

# Doing more with word pieces
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['su'])

# Finding word stems
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]

re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
    res = re.findall(regexp, word)
    if len(res) > 0:
        stem = res[0]  # a (stem, suffix) pair
    else:
        stem = None
    return stem

raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print(tokens)
res = [stem(t) for t in tokens if len(t) > 0]
print(res)

# Searching tokenized text
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
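# Why the stem() regexp above switches from (.*) to (.*?): a greedy star grabs
# as much as it can, forcing the suffix group onto the shortest alternative.
# A sketch (not in the original listing):
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
# [('processe', 's')]   -- greedy: the stem eats as much as possible
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
# [('process', 'es')]   -- non-greedy: the shortest stem that still matches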
# 3.6 Normalizing Text
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)

# Stemmers
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
res = [porter.stem(t) for t in tokens]
print(res)
res = [lancaster.stem(t) for t in tokens]
print(res)

class IndexedText(object):
    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)  # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance("lie")

# Lemmatization
wnl = nltk.WordNetLemmatizer()
res = [wnl.lemmatize(t) for t in tokens]
print(res)

# 3.7 Regular Expressions for Tokenizing Text

# Simple approaches to tokenization
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""
re.split(r' ', raw)
re.split(r'[ \t\n]+', raw)
re.split(r'\W+', raw)
re.findall(r'\w+|\S\w*', raw)
re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw)

# NLTK's regular expression tokenizer: nltk.regexp_tokenize(text, pattern)
text = 'That U.S.A poster-print costs $12.40...'
pattern = r"""(?x)        # set flag to allow verbose regexps
    ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*            # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                # ellipsis
  | [][.,;"'?():-_`]      # these are separate tokens; includes ], [
"""
# note: in NLTK 3 the capturing groups above make regexp_tokenize return
# tuples; non-capturing groups (?:...) avoid this
nltk.regexp_tokenize(text, pattern)

# 3.8 Segmentation

# Sentence segmentation
l1 = len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
print(l1)  # average sentence length in words
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
pprint.pprint(sents[171:181])

# Word segmentation
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i + 1
    words.append(text[last:])
    return words

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
res = segment(text, seg1)
print(res)
res = segment(text, seg2)
print(res)

def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
res = segment(text, seg3)
print(res)
res = evaluate(text, seg3)
print(res)
res = evaluate(text, seg2)
print(res)
res = evaluate(text, seg1)
print(res)

# Simulated annealing search: randomly perturb the segmentation, with the
# perturbation shrinking as the temperature cools
from random import randint

def flip(segs, pos):
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos+1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs

def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print("")
    return segs

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)
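# A tiny worked check of the evaluate() objective above (not in the original
# listing): the score is the token count plus the size of the lexicon written
# out as one space-separated string, so it rewards reusing the same words.
toy = "dogdog"
print(segment(toy, "00100"))   # ['dog', 'dog']
print(evaluate(toy, "00100"))  # 2 tokens + len('dog') = 2 + 3 = 5
print(evaluate(toy, "01000"))  # ['do', 'gdog'] -> 2 + len('do gdog') = 2 + 7 = 9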
# 3.9 Formatting: From Lists to Strings

# From lists to strings
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
res = ' '.join(silly)
print(res)  # We called him Tortoise because he taught us .
res = ';'.join(silly)
print(res)  # We;called;him;Tortoise;because;he;taught;us;.
res = ''.join(silly)
print(res)  # WecalledhimTortoisebecausehetaughtus.

# Strings and formats
word = 'cat'
sentence = """hello
world"""
print(word)
print(sentence)
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in fdist:
    print(word, '->', fdist[word], ";", end='')
print("")
for word in fdist:
    print('%s->%d;' % (word, fdist[word]), end='')
print("")
template = 'Lee wants a %s right now'
menu = ['sandwich', 'spam fritter', 'pancake']
for snack in menu:
    print(template % snack)

# Lining things up
print('%6s' % 'dog')   # right-justified in a field of width 6
print('%-6s' % 'dog')  # left-justified
width = 6
print('%-*s' % (width, 'dog'))
count, total = 3205, 9375
print("accuracy for %d words: %2.4f%%" % (total, 100 * count / total))

# this tabular layout is a common printing pattern
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end='')
    for word in words:
        print('%6s' % word, end='')
    print("")
    for category in categories:
        print('%-16s' % category, end='')
        for word in words:
            print('%6d' % cfdist[category][word], end='')
        print("")

from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)

# Writing results to a file
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")
print(len(words))
print(str(len(words)))
output_file.write(str(len(words)) + "\n")
output_file.close()

# Text wrapping
from textwrap import fill
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',',
          'more', 'is', 'said', 'than', 'done', '.']
for word in saying:
    print(word, '(' + str(len(word)) + ')', end="")
print("")
format = '%s (%d),'
pieces = [format % (word, len(word)) for word in saying]
output = ' '.join(pieces)
wrapped = fill(output)
print(wrapped)
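# The %-style formatting used throughout this section predates str.format()
# and f-strings; a sketch of the modern equivalents (not in the original
# listing; f-strings require Python 3.6+):
count, total = 3205, 9375
print("accuracy for %d words: %2.4f%%" % (total, 100 * count / total))
print("accuracy for {} words: {:2.4f}%".format(total, 100 * count / total))
print(f"accuracy for {total} words: {100 * count / total:2.4f}%")
print('%-6s|%6s' % ('dog', 'cat'))  # left- and right-justified, width 6
print(f"{'dog':<6}|{'cat':>6}")     # same alignment with an f-string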