
9 Recurrent Neural Networks: Networks with Memory (3)


Chinese-English Machine Translation with a seq2seq Attention Model

1. Preparing the samples
Unpack the file "中英文平行语料库.rar" from the companion materials. Put the English files in:
F:/PycharmProjects/test20190701/fanyichina/yuliao/from
and the Chinese files in:
F:/PycharmProjects/test20190701/fanyichina/yuliao/to
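
Before building the dictionaries, it helps to confirm that the two folders hold the same number of parallel files. The snippet below is a minimal sketch assuming the paths above; it is not part of the book's code:

import os

from_dir = "F:/PycharmProjects/test20190701/fanyichina/yuliao/from"  # English side
to_dir = "F:/PycharmProjects/test20190701/fanyichina/yuliao/to"      # Chinese side

# Skip editor backup files ending with "~", as the preprocessing code does
en_files = sorted(f for f in os.listdir(from_dir) if not f.endswith("~"))
ch_files = sorted(f for f in os.listdir(to_dir) if not f.endswith("~"))
print("English files:", len(en_files), "Chinese files:", len(ch_files))
assert len(en_files) == len(ch_files), "the parallel corpus is unbalanced"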

2. Building the Chinese and English dictionaries
9-33 datautil (the sample-preprocessing file)

9-33 (part 2): building the Chinese and English dictionaries
Code:

import numpy as np
from tensorflow.python.platform import gfile
from random import shuffle
import jieba
import re
import os

import collections

# System tokens; they must be added when the dictionary is created
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Placeholder used to replace digits in the text; not a system token
_NUM = "_NUM"

# Get the list of files in a directory
def getRawFileList(path):
    files = []
    names = []
    for f in os.listdir(path):
        if not f.endswith("~") and not f == "":  # skip backup files and empty names
            files.append(os.path.join(path, f))
            names.append(f)
    return files, names

# Read one text file and return its tokens (word-segmented for Chinese)
def get_ch_lable(txt_file, Isch=True, normalize_digits=False):
    labels = list()
    labelssz = []
    with open(txt_file, 'rb') as f:
        for label in f:
            linstr1 = label.decode('utf-8')
            if normalize_digits:
                linstr1 = re.sub(r'\d+', _NUM, linstr1)  # replace digit runs with _NUM
            notoken = basic_tokenizer(linstr1)
            if Isch:
                notoken = fenci(notoken)   # Chinese: segment with jieba
            else:
                notoken = notoken.split()  # English: split on whitespace
            labels.extend(notoken)
            labelssz.append(len(labels))   # cumulative token count after each line
    return labels, labelssz

# Basic cleanup: strip the Chinese punctuation marks listed in _CHWORD_SPLIT
# (the capture group in _WORD_SPLIT keeps the English punctuation in place)
def basic_tokenizer(sentence):
    _WORD_SPLIT = "([.,!?\"':;)(])"
    _CHWORD_SPLIT = '、|。|,|‘|’'
    str1 = ""
    for i in re.split(_CHWORD_SPLIT, sentence):
        str1 = str1 + i
    str2 = ""
    for i in re.split(_WORD_SPLIT, str1):
        str2 = str2 + i
    return str2

# Segment Chinese text with jieba and return a list of words
def fenci(training_data):
    seg_list = jieba.cut(training_data)  # accurate mode by default
    training_ci = " ".join(seg_list)
    training_ci = training_ci.split()    # split the joined string on spaces
    return training_ci

def build_dataset(words, n_words):
    """Process raw inputs into word ids, counts, and both direction mappings."""
    count = [[_PAD, -1], [_GO, -1], [_EOS, -1], [_UNK, -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = UNK_ID  # map out-of-vocabulary words to _UNK
            unk_count += 1
        data.append(index)
    count[UNK_ID][1] = unk_count  # record how many words were mapped to _UNK
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

# Read every text file in a directory and return the tokens of each file
def get_ch_path_text(raw_data_dir, Isch=True, normalize_digits=False):
    text_files, _ = getRawFileList(raw_data_dir)
    labels = []

    training_dataszs = list([0])
    if len(text_files) == 0:
        print("err: no files in", raw_data_dir)
        return labels, training_dataszs
    print(len(text_files), "files, one is", text_files[0])
    shuffle(text_files)

    for text_file in text_files:
        training_data, training_datasz = get_ch_lable(text_file, Isch, normalize_digits)

        training_ci = np.array(training_data)
        training_ci = np.reshape(training_ci, [-1, ])
        labels.append(training_ci)

        # turn the per-file cumulative counts into global cumulative counts
        training_datasz = np.array(training_datasz) + training_dataszs[-1]
        training_dataszs.extend(list(training_datasz))
    print("here", training_dataszs)
    return labels, training_dataszs

# Isch=True for Chinese, False for English
# Build the dictionary; e.g. max_vocabulary_size=500 keeps 500 words
def create_vocabulary(vocabulary_file, raw_data_dir, max_vocabulary_size, Isch=True, normalize_digits=True):
    texts, textssz = get_ch_path_text(raw_data_dir, Isch, normalize_digits)
    print(texts[0], len(texts))
    print("line count", len(textssz), textssz)
    all_words = []
    for label in texts:
        print("word count", len(label))
        all_words += [word for word in label]
    print("total word count", len(all_words))

    training_label, count, dictionary, reverse_dictionary = build_dataset(all_words, max_vocabulary_size)
    print("reverse_dictionary", reverse_dictionary, len(reverse_dictionary))
    if not gfile.Exists(vocabulary_file):
        print("Creating vocabulary %s from data %s" % (vocabulary_file, data_dir))
        with gfile.GFile(vocabulary_file, mode="w") as vocab_file:
            # write one word per line, in id order; the line number is the word id
            for w in sorted(reverse_dictionary)[:max_vocabulary_size]:
                print(reverse_dictionary[w])
                vocab_file.write(reverse_dictionary[w] + "\n")
    else:
        print("already have vocabulary! do nothing.")
    return training_label, count, dictionary, reverse_dictionary, textssz

'''------------------------------------------------------------------------------------------------'''
# 2. Build the Chinese and English dictionaries

data_dir = "F:/PycharmProjects/test20190701/fanyichina"
raw_data_dir = "F:/PycharmProjects/test20190701/fanyichina/yuliao/from"  # English corpus directory
raw_data_dir_to = "F:/PycharmProjects/test20190701/fanyichina/yuliao/to"  # Chinese corpus directory
vocabulary_fileen = "dicten.txt"  # the dictionary files are written into the fanyichina folder
vocabulary_filech = "dictch.txt"

jieba.load_userdict("C:/Users/50633/Desktop/素材与样本/实例74 素材/myjiebadict.txt")

plot_histograms = plot_scatter = True
vocab_size = 40000

max_num_lines = 1
max_target_size = 200
max_source_size = 200

def main():
    vocabulary_filenameen = os.path.join(data_dir, vocabulary_fileen)
    vocabulary_filenamech = os.path.join(data_dir, vocabulary_filech)

    # Build the English dictionary
    training_dataen, counten, dictionaryen, reverse_dictionaryen, textsszen = create_vocabulary(
        vocabulary_filenameen, raw_data_dir, vocab_size, Isch=False, normalize_digits=True)
    print("training_data", len(training_dataen))
    print("dictionary", len(dictionaryen))

    # Build the Chinese dictionary
    training_datach, countch, dictionarych, reverse_dictionarych, textsszch = create_vocabulary(
        vocabulary_filenamech, raw_data_dir_to, vocab_size, Isch=True, normalize_digits=True)
    print("training_datach", len(training_datach))
    print("dictionarych", len(dictionarych))

'''------------------------------------------------------------------------------------------------'''

if __name__ == "__main__":
    main()
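
To sanity-check the tokenizer helpers on their own, a short sketch like the one below can be run after the functions above are defined. The sample sentences and the expected segmentation are only illustrative; jieba's output may differ depending on the user dictionary loaded above.

# Hypothetical quick test of the tokenizer helpers
sample_ch = "今天的天气很好。"            # a Chinese sentence with punctuation
sample_en = "The weather is nice today."  # an English sentence

print(fenci(basic_tokenizer(sample_ch)))   # e.g. ['今天', '的', '天气', '很', '好']
print(basic_tokenizer(sample_en).split())  # whitespace split, as the English branch does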

Result:
After the code runs, the two dictionary files dicten.txt and dictch.txt appear in the fanyichina folder under the current directory.
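
Each line of these files holds one word, and the line number (counting from 0) is that word's id, so the word-to-id mapping can be rebuilt directly from them. The snippet below is a minimal sketch, assuming the paths used above, for inspecting the generated files:

# Hypothetical check: rebuild the word -> id mapping from the generated dictionary files
def load_vocab(vocab_path):
    with open(vocab_path, "r", encoding="utf-8") as f:
        words = [line.strip() for line in f]
    return {w: i for i, w in enumerate(words)}

vocab_en = load_vocab("F:/PycharmProjects/test20190701/fanyichina/dicten.txt")
vocab_ch = load_vocab("F:/PycharmProjects/test20190701/fanyichina/dictch.txt")
print(len(vocab_en), len(vocab_ch))
# The system tokens come first, so this should print 0 1 2 3
print(vocab_en["_PAD"], vocab_en["_GO"], vocab_en["_EOS"], vocab_en["_UNK"])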
