9 循环神经网络——具有记忆功能的网络(3)
2019-07-03 14:50
1446 查看
基于seq2seq注意力模型实现中英文机器翻译
1.样本准备
配套资料中的“中英文平行语料库.rar”文件,解压后英文文件放在:
F:/PycharmProjects/test20190701/fanyichina/yuliao/from
中文文件放在:
F:/PycharmProjects/test20190701/fanyichina/yuliao/to
2.生成中英文字典
9-33 datautil(样本预处理文件)
9-33 2 生成中、英文字典
程序:
import collections
import os
import re
from random import shuffle

import numpy as np
import jieba
from tensorflow.python.platform import gfile

# Special vocabulary symbols. They must occupy the first four dictionary
# slots so that their ids match PAD_ID..UNK_ID below.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Placeholder token substituted for digit runs; not a system symbol.
_NUM = "_NUM"


def getRawFileList(path):
    """Return (full paths, bare names) of the files under *path*.

    Editor backup files ending in '~' are skipped.
    """
    files = []
    names = []
    for f in os.listdir(path):
        # BUG FIX: the original condition used 'or not f == ""', which is
        # always true, so '~' backups were never filtered out.
        if not f.endswith("~") and f != "":
            files.append(os.path.join(path, f))
            names.append(f)
    return files, names


def get_ch_lable(txt_file, Isch=True, normalize_digits=False):
    """Read one UTF-8 text file and tokenize it line by line.

    Args:
        txt_file: path of the file to read.
        Isch: True -> segment with jieba (Chinese); False -> split on spaces.
        normalize_digits: replace digit runs with the _NUM placeholder.

    Returns:
        (labels, labelssz): the flat token list, and the cumulative token
        count recorded after each input line.
    """
    labels = list()
    labelssz = []
    with open(txt_file, 'rb') as f:
        for label in f:
            linstr1 = label.decode('utf-8')
            if normalize_digits:
                # Raw string: the original '\d+' relied on an invalid
                # escape sequence that Python merely tolerates.
                linstr1 = re.sub(r'\d+', _NUM, linstr1)
            notoken = basic_tokenizer(linstr1)
            if Isch:
                notoken = fenci(notoken)
            else:
                notoken = notoken.split()
            labels.extend(notoken)
            labelssz.append(len(labels))
    return labels, labelssz


def basic_tokenizer(sentence):
    """Strip English and Chinese punctuation from *sentence*."""
    _WORD_SPLIT = "([.,!?\"':;)(])"
    _CHWORD_SPLIT = '、|。|,|‘|’'
    str1 = ""
    for i in re.split(_CHWORD_SPLIT, sentence):
        str1 = str1 + i
    str2 = ""
    for i in re.split(_WORD_SPLIT, str1):
        str2 = str2 + i
    return str2


def fenci(training_data):
    """Segment a Chinese string with jieba and return the token list."""
    seg_list = jieba.cut(training_data)  # accurate mode by default
    training_ci = " ".join(seg_list)
    training_ci = training_ci.split()  # split on whitespace
    return training_ci


def build_dataset(words, n_words):
    """Process raw input words into a dataset of at most *n_words* tokens.

    Returns:
        data: the input words mapped to integer ids.
        count: [token, frequency] pairs, the four system symbols first.
        dictionary: token -> id mapping.
        reversed_dictionary: id -> token mapping.
    """
    count = [[_PAD, -1], [_GO, -1], [_EOS, -1], [_UNK, -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            # BUG FIX: out-of-vocabulary words must map to _UNK (id 3);
            # the original mapped them to 0, which is the _PAD symbol.
            index = UNK_ID
            unk_count += 1
        data.append(index)
    # BUG FIX: record the unknown-word count on the _UNK entry, not _PAD.
    count[UNK_ID][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def get_ch_path_text(raw_data_dir, Isch=True, normalize_digits=False):
    """Tokenize every file found under *raw_data_dir*.

    Returns:
        (labels, training_dataszs): one 1-D token array per file, and the
        cumulative token counts across all files (starting at 0).
    """
    text_files, _ = getRawFileList(raw_data_dir)
    labels = []
    training_dataszs = list([0])
    if len(text_files) == 0:
        print("err:no files in ", raw_data_dir)
        # BUG FIX: callers unpack two values; the original returned only
        # `labels` on this path, which would raise at the call site.
        return labels, training_dataszs
    print(len(text_files), "files,one is", text_files[0])
    shuffle(text_files)
    for text_file in text_files:
        training_data, training_datasz = get_ch_lable(text_file, Isch, normalize_digits)
        training_ci = np.array(training_data)
        training_ci = np.reshape(training_ci, [-1, ])
        labels.append(training_ci)
        # Shift each file's cumulative counts by the running total so the
        # combined list stays monotonically increasing.
        training_datasz = np.array(training_datasz) + training_dataszs[-1]
        training_dataszs.extend(list(training_datasz))
        print("here", training_dataszs)
    return labels, training_dataszs


# Isch=True -> Chinese corpus, False -> English corpus.
def create_vocabulary(vocabulary_file, raw_data_dir, max_vocabulary_size,
                      Isch=True, normalize_digits=True):
    """Build a vocabulary from the corpus and write it to *vocabulary_file*.

    If the vocabulary file already exists, nothing is written.

    Args:
        vocabulary_file: output path for the one-token-per-line dictionary.
        raw_data_dir: directory holding the corpus files.
        max_vocabulary_size: cap on the number of vocabulary entries.
        Isch: True for Chinese (jieba segmentation), False for English.
        normalize_digits: replace digit runs with _NUM before counting.

    Returns:
        (training_label, count, dictionary, reverse_dictionary, textssz).
    """
    texts, textssz = get_ch_path_text(raw_data_dir, Isch, normalize_digits)
    print(texts[0], len(texts))
    print("行数", len(textssz), textssz)
    all_words = []
    for label in texts:
        print("词数", len(label))
        all_words += [word for word in label]
    print("词数", len(all_words))
    training_label, count, dictionary, reverse_dictionary = build_dataset(all_words, max_vocabulary_size)
    print("reverse_dictionary", reverse_dictionary, len(reverse_dictionary))
    if not gfile.Exists(vocabulary_file):
        # BUG FIX: report the directory actually scanned; the original
        # printed the module-level global `data_dir` instead of the
        # `raw_data_dir` parameter.
        print("Creating vocabulary %s from data %s" % (vocabulary_file, raw_data_dir))
        with gfile.GFile(vocabulary_file, mode="w") as vocab_file:
            # BUG FIX: a dict cannot be sliced; write at most
            # max_vocabulary_size entries in id order instead.
            for w in sorted(reverse_dictionary)[:max_vocabulary_size]:
                print(reverse_dictionary[w])
                vocab_file.write(reverse_dictionary[w] + "\n")
    else:
        print("already have vocabulary!  do nothing !!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    return training_label, count, dictionary, reverse_dictionary, textssz


'''------------------------------------------------------------------------------------------------'''
# 2: build the Chinese and English dictionaries.
data_dir = "F:/PycharmProjects/test20190701/fanyichina"
raw_data_dir = "F:/PycharmProjects/test20190701/fanyichina/yuliao/from"  # English corpus path
raw_data_dir_to = "F:/PycharmProjects/test20190701/fanyichina/yuliao/to"  # Chinese corpus path
vocabulary_fileen = "dicten.txt"  # dictionary files are created under data_dir
vocabulary_filech = "dictch.txt"

jieba.load_userdict("C:/Users/50633/Desktop/素材与样本/实例74 素材/myjiebadict.txt")

plot_histograms = plot_scatter = True
vocab_size = 40000

max_num_lines = 1
max_target_size = 200
max_source_size = 200


def main():
    """Build and save the English and Chinese vocabulary files."""
    vocabulary_filenameen = os.path.join(data_dir, vocabulary_fileen)
    vocabulary_filenamech = os.path.join(data_dir, vocabulary_filech)
    #####################################################################
    # Build the English dictionary.
    training_dataen, counten, dictionaryen, reverse_dictionaryen, textsszen = create_vocabulary(
        vocabulary_filenameen, raw_data_dir, vocab_size, Isch=False, normalize_digits=True)
    print("training_data", len(training_dataen))
    print("dictionary", len(dictionaryen))
    #########################
    # Build the Chinese dictionary.
    training_datach, countch, dictionarych, reverse_dictionarych, textsszch = create_vocabulary(
        vocabulary_filenamech, raw_data_dir_to, vocab_size, Isch=True, normalize_digits=True)
    print("training_datach", len(training_datach))
    print("dictionarych", len(dictionarych))
    #######################################################################


'''------------------------------------------------------------------------------------------------'''
if __name__ == "__main__":
    main()
结果:
执行代码后,会在当前目录下的fanyichina文件夹里找到dicten.txt与dictch.txt两个字典文件。
相关文章推荐
- RNN,具有记忆功能神经网络的理解与实现
- TensorFlow人工智能引擎入门教程之九 RNN/LSTM循环神经网络长短期记忆网络使用
- 5 什么是LSTM-RNN(长短期记忆循环神经网络)?
- 双向长短时记忆循环神经网络详解(Bi-directional LSTM RNN)
- 双向长短时记忆循环神经网络详解(Bi-directional LSTM RNN)
- 机器学习与Tensorflow(5)——循环神经网络、长短时记忆网络
- 循环神经网络RNN模型和长短时记忆系统LSTM
- 【深度学习】RNN(循环神经网络)之LSTM(长短时记忆)
- 深度学习——循环神经网络/递归神经网络(RNN)及其改进的长短时记忆网络(LSTM)
- TensorFlow实现经典深度学习网络(7):TensorFlow实现双向长短时记忆循环神经网络
- 双向长短时记忆循环神经网络详解(Bi-directional LSTM RNN)
- RNN循环神经网络以及LSTM长短期记忆模型-简介
- Bi-directional LSTM RNN(双向长短时记忆循环神经网络)
- 双向长短时记忆循环神经网络详解(Bi-directional LSTM RNN)
- 离散型Hopfield神经网络联想记忆功能
- 学习笔记TF057:TensorFlow MNIST,卷积神经网络、循环神经网络、无监督学习
- 循环神经网络(RNN、RNN变体、RNN训练方法:BPTT)
- 循环神经网络RNN
- Keras RNN循环神经网络(四)
- 循环神经网络