Datawhale Assignment: BERT-Based Text Classification
2020-07-16 04:38
Datawhale Assignment
- BERT-Based Text Classification
- Import
- Parameters and Hyperparameters
- Tokenizer
- Load IMDB Dataset
- Load Pretrained-Bert
- Model
- Train
- Evaluate
- Predict_sentiment
- Train Loop
BERT-Based Text Classification
Import
# Implemented with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# Hugging Face transformers provides the pretrained BERT tokenizer and model
from transformers import BertTokenizer, BertModel
from torchtext import data, datasets

import numpy as np
import random
import time
Parameters and Hyperparameters
# Parameters and hyperparameters
SEED = 1234
TRAIN = False
BATCH_SIZE = 128
N_EPOCHS = 5
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
TEXT = "I like you!"

# Fix the random seeds so experiments are reproducible
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
Tokenizer
# Use the BERT tokenizer from transformers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

init_token_id = tokenizer.cls_token_id
eos_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id

max_input_len = tokenizer.max_model_input_sizes['bert-base-uncased']

# Truncate each sentence to 510 tokens, leaving room for the leading
# [CLS] token and the trailing [SEP] token (512 tokens in total)
def tokenize_and_crop(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_len - 2]
    return tokens
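As a quick sanity check (not part of the original post), you can print the special-token IDs and the output of tokenize_and_crop for a short sentence; the exact IDs come from the bert-base-uncased vocabulary:

# Illustrative check: inspect the special-token IDs and a sample tokenization
print(init_token_id, eos_token_id, pad_token_id, unk_token_id)  # typically 101 102 0 100
print(tokenize_and_crop("This film is great!"))  # WordPiece tokens, at most 510 of them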
Load IMDB Dataset
def load_data():
    text = data.Field(
        batch_first=True,
        use_vocab=False,
        tokenize=tokenize_and_crop,
        preprocessing=tokenizer.convert_tokens_to_ids,
        init_token=init_token_id,
        pad_token=pad_token_id,
        unk_token=unk_token_id
    )
    label = data.LabelField(dtype=torch.float)

    train_data, test_data = datasets.IMDB.splits(text, label)
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))

    print(f"training examples count: {len(train_data)}")
    print(f"test examples count: {len(test_data)}")
    print(f"validation examples count: {len(valid_data)}")

    label.build_vocab(train_data)

    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device
    )
    return train_iter, valid_iter, test_iter
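To verify the pipeline end to end, a hypothetical check (not in the original post) is to pull a single batch and confirm its shapes. Note that load_data captures the device variable defined in the next section, so this must run after that definition:

# Illustrative check: inspect one batch from the training iterator
train_iter, valid_iter, test_iter = load_data()
batch = next(iter(train_iter))
print(batch.text.shape)   # [BATCH_SIZE, seq_len] of BERT token IDs, seq_len <= 512
print(batch.label.shape)  # [BATCH_SIZE] of 0./1. float labels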
Load Pretrained-Bert
# Use the GPU if one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the pretrained BERT model via transformers
bert_model = BertModel.from_pretrained('bert-base-uncased')
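An optional addition (not in the original post): the model below runs BERT under torch.no_grad() and never updates it, so BERT's weights can also be marked as non-trainable. This keeps the Adam optimizer from allocating state for roughly 110M parameters it will never change:

# Optional: freeze BERT so the optimizer only tracks the GRU and linear layers
for param in bert_model.parameters():
    param.requires_grad = False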
Model
# Sentiment analysis model with BERT as the base encoder:
# a two-layer GRU sits on top of BERT, followed by a linear
# layer that produces the classification output.
class SentimentModel(nn.Module):
    def __init__(
        self,
        bert,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout
    ):
        super(SentimentModel, self).__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=0 if n_layers < 2 else dropout
        )
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # BERT is used as a frozen feature extractor: no gradients flow
        # through it, so only the GRU and the linear layer are trained
        with torch.no_grad():
            embedded = self.bert(text)[0]
        _, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        output = self.out(hidden)
        return output

model = SentimentModel(
    bert_model,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT
)
print(model)
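A small hypothetical helper (not in the original post) makes the division of labor visible: with BERT frozen as suggested above, only the GRU and the linear head (a few million parameters) remain trainable; without freezing, the count also includes BERT's roughly 110M weights:

# Illustrative helper: count the parameters the optimizer will actually update
def count_trainable_params(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

print(f'trainable parameters: {count_trainable_params(model):,}')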
Train
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
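A tiny worked example (hypothetical inputs) of binary_accuracy: sigmoid maps the logits [0.7, -0.3] to about [0.67, 0.43], which round to predictions [1., 0.]; against labels [1., 1.] that is one correct out of two:

# Worked example with made-up logits and labels
preds = torch.tensor([0.7, -0.3])
labels = torch.tensor([1., 1.])
print(binary_accuracy(preds, labels))  # tensor(0.5000)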
Evaluate
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
Predict_sentiment
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_len - 2]
    indexed = [init_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_id]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()
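Usage sketch (assumes a trained model.pt checkpoint has been loaded and the model moved to device): the sigmoid output is the probability of positive sentiment, so values near 1 mean positive and values near 0 mean negative:

# Hypothetical examples; the exact outputs depend on the trained checkpoint
print(predict_sentiment(model, tokenizer, "This film is terrible"))  # close to 0
print(predict_sentiment(model, tokenizer, "This film is great"))     # close to 1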
Train Loop
if __name__ == "__main__":
    # Start training
    if TRAIN:
        # Load the data
        train_iter, valid_iter, test_iter = load_data()

        optimizer = optim.Adam(model.parameters())
        criterion = nn.BCEWithLogitsLoss().to(device)
        model = model.to(device)

        best_valid_loss = float('inf')
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            # Train for one epoch
            train_loss, train_acc = train(model, train_iter, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            # Checkpoint the model whenever the validation loss improves
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'model.pt')

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

        # Test with the best checkpoint
        model.load_state_dict(torch.load('model.pt'))
        test_loss, test_acc = evaluate(model, test_iter, criterion)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Inference
    else:
        model = model.to(device)
        model.load_state_dict(torch.load('model.pt', map_location=device))
        sentiment = predict_sentiment(model, tokenizer, TEXT)
        print(sentiment)