循环神经网络 (RNN)
本课介绍如何使用循环神经网络 (RNN) 按顺序处理输入(例如文本中的每个标记):我们一次处理一个时间步的输入,既可以使用 RNN 在每个时间步的输出,也可以只使用最后一个相关时间步的输出。此外还会介绍门控 RNN,它用于缓解简单 RNN 在反向传播中因反复相乘而产生的梯度爆炸(数值大于 1)与梯度消失(数值小于 1)问题。
概述
到目前为止,我们已经处理了整个输入(例如,在整个输入中应用过滤器以提取特征),但我们也可以按顺序处理我们的输入。例如,我们可以将文本中的每个标记视为时间事件(时间步长)。我们可以一次处理每个时间步,并在处理完最后一个时间步(令牌)后预测类别。这是非常强大的,因为该模型现在有一种有意义的方式来解释我们序列中标记的顺序并进行相应的预测。
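变量 | 描述 |
---|---|
$N$ | 批量大小 |
$E$ | 嵌入维度 |
$H$ | 隐藏单元数 |
$W_{hh}$ | RNN 权重 $\in \mathbb{R}^{H \times H}$ |
$h_{t-1}$ | 前一个时间步的隐藏状态 $\in \mathbb{R}^{N \times H}$ |
$W_{xh}$ | 输入权重 $\in \mathbb{R}^{E \times H}$ |
$X_t$ | 时间步 $t$ 的输入 $\in \mathbb{R}^{N \times E}$ |
$b_h$ | 隐藏单元偏置 $\in \mathbb{R}^{H \times 1}$ |
$h_t$ | 时间步 $t$ 的 RNN 输出 |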
- 目标:
- 通过考虑当前输入以及从先前输入中学到的内容来处理顺序数据。
- 优点:
- 以有意义的方式考虑输入的顺序和先前的输入。
- 用于生成序列的条件生成。
- 缺点:
- 每个时间步的预测都依赖于之前的预测,因此很难并行化 RNN 操作。
- 处理长序列会产生内存和计算问题。
- 可解释性很困难,但很少有技术使用来自 RNN 的激活来查看输入的哪些部分被处理。
- 杂项:
- 使 RNN 更快且可解释的架构调整是一个正在进行的研究领域。
设置
让我们为我们的主要任务设置种子和设备。
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
SEED = 1234
def set_seeds(seed=1234):
    """Seed every random number generator used in this lesson.

    Args:
        seed: Integer seed handed to ``random``, NumPy and PyTorch
            (CPU and CUDA generators).
    """
    # Python-level generators
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # multi-GPU
# Seed all generators before anything random happens
set_seeds(seed=SEED)

# Choose the compute device: GPU only when it is both available and requested
cuda = True
device = torch.device(
    "cuda" if (torch.cuda.is_available() and cuda) else "cpu")

# Make newly created tensors default to the matching float tensor type
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
else:
    torch.set_default_tensor_type("torch.FloatTensor")
print (device)
加载数据
我们将下载 AG News 数据集,该数据集包含来自 4 个独特类别(`Business`、`Sci/Tech`、`Sports`、`World`)的 120K 文本样本。
# Load data
# Remote CSV that is read straight into a DataFrame
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = pd.read_csv(url, header=0) # load (row 0 of the file supplies the column names)
df = df.sample(frac=1).reset_index(drop=True) # shuffle every row, then renumber the index from 0
df.head() # notebook display of the first rows
标题 | 类别 | |
---|---|---|
0 | 沙龙接受减少加沙军队行动的计划...... | 世界 |
1 | 野生动物犯罪斗争中的互联网关键战场 | 科技 |
2 | 7 月耐用品订单增长 1.7% | 商业 |
3 | 华尔街放缓的迹象越来越多 | 商业 |
4 | 真人秀的新面孔 | 世界 |
预处理
我们将首先通过执行诸如将文本转换为小写、删除停用词(填充词)、使用正则表达式进行过滤等操作来清理我们的输入数据。
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
# Download the NLTK "stopwords" corpus so it can be loaded on the next line
nltk.download("stopwords")
# English stopword list; used as the default `stopwords` argument of preprocess()
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5]) # peek at the first five entries
# Stemmer instance. NOTE(review): not referenced anywhere else in the visible code
porter = PorterStemmer()
def preprocess(text, stopwords=STOPWORDS):
    """Normalize a raw text for the classification task.

    Steps, in order: lowercase, drop stopwords, drop parenthesized spans,
    pad selected punctuation with spaces, replace runs of non-alphanumeric
    characters with a space, collapse repeated spaces, trim.

    Args:
        text: Raw input string.
        stopwords: Words removed as whole-word matches from the lowercased text.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()

    # Strip each stopword together with the whitespace that trails it
    stopword_re = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    cleaned = stopword_re.sub("", lowered)

    # Remaining clean-up passes, applied strictly in this order
    passes = (
        (r"\([^)]*\)", ""),               # words in parenthesis
        (r"([-;;.,!?<=>])", r" \1 "),     # space out punctuation
        ("[^A-Za-z0-9]+", " "),           # non alphanumeric chars
        (" +", " "),                      # multiple spaces
    )
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
# Sample
# Try the cleaner on one hand-written sentence (notebook display of the result)
text = "Great week for the NYSE!"
preprocess(text=text)
# Apply to dataframe
# Work on a copy so the raw `df` stays available for the comparison below
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
# Show the first title before and after preprocessing
print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")
拆分数据
import collections
from sklearn.model_selection import train_test_split
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
def train_val_test_split(X, y, train_size):
    """Split a dataset into stratified train / validation / test partitions.

    Args:
        X: Array-like of inputs.
        y: Array-like of labels aligned with ``X``; used for stratification.
        train_size: Share of the data assigned to the training split
            (forwarded to ``train_test_split``).

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The data left
        over after the training split is divided evenly between the
        validation and test splits.
    """
    # Use the caller's ``train_size``; the global TRAIN_SIZE used to be read
    # here, which silently ignored the argument.
    X_train, X_, y_train, y_ = train_test_split(
        X, y, train_size=train_size, stratify=y)
    # Halve the remainder into validation and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test
# Data
# Inputs are the preprocessed titles; targets are the category column
X = preprocessed_df["title"].values
y = preprocessed_df["category"].values
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
X=X, y=y, train_size=TRAIN_SIZE)
# Report the shape of every split and show one (input → label) example
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")
X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_test: (18000,), y_test: (18000,)
样本点:中国与朝鲜核谈判作斗争 → 世界
标签编码
接下来,我们将定义 aLabelEncoder
将我们的文本标签编码为唯一索引
import itertools
class LabelEncoder(object):
    """Map class labels to integer indices and back.

    Attributes are kept in sync by ``__init__`` and ``fit``:
    ``class_to_index`` (label -> index), ``index_to_class`` (index -> label)
    and ``classes`` (labels in mapping order).
    """

    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing label -> index dict."""
        # ``None`` replaces the former mutable ``{}`` default argument;
        # an empty/None mapping always yields a fresh dict.
        self.class_to_index = class_to_index or {}
        self._refresh_views()

    def _refresh_views(self):
        """Rebuild the reverse mapping and class list from ``class_to_index``."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Assign an index to every unique label in ``y`` (in ``np.unique`` order).

        Returns:
            ``self``, so calls can be chained.
        """
        for i, class_ in enumerate(np.unique(y)):
            self.class_to_index[class_] = i
        self._refresh_views()
        return self

    def encode(self, y):
        """Convert an iterable of labels into an integer NumPy array.

        Raises:
            KeyError: If a label was never seen by ``fit``.
        """
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        """Convert an iterable of indices back into a list of labels."""
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> index mapping to the JSON file at path ``fp``."""
        contents = {"class_to_index": self.class_to_index}
        # Separate name for the handle so the path argument is not shadowed
        with open(fp, "w") as f:
            json.dump(contents, f, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by ``save``."""
        with open(fp, "r") as f:
            kwargs = json.load(fp=f)
        return cls(**kwargs)
# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train) # learn the label -> index mapping from the training labels only
NUM_CLASSES = len(label_encoder) # number of entries in the learned mapping
label_encoder.class_to_index # notebook display of the mapping
{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}
# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}") # first training label before encoding
# Replace the string labels of every split with their integer indices
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train[0]}") # the same label after encoding
y_train[0]: World
y_train[0]: 3
# Class weights
counts = np.bincount(y_train) # number of training samples per encoded class index
class_weights = {i: 1.0/count for i, count in enumerate(counts)} # inverse-frequency weight for each class index
print (f"counts: {counts}\nweights: {class_weights}")
分词器
我们将定义一个Tokenizer
将我们的文本输入数据转换为标记索引。
import json
from collections import Counter
from more_itertools import take
class Tokenizer(object):
    """Convert texts to sequences of token indices and back.

    Index 0 is reserved for the padding token and index 1 for the
    out-of-vocabulary token when no ``token_to_index`` mapping is supplied.
    """

    def __init__(self, char_level, num_tokens=None,
                 pad_token="<PAD>", oov_token="<UNK>",
                 token_to_index=None):
        """
        Args:
            char_level: If truthy, tokens are characters; otherwise texts are
                split on single spaces.
            num_tokens: Optional vocabulary cap that includes the pad and
                oov tokens.
            pad_token: Padding token string.
            oov_token: Token used for anything outside the vocabulary.
            token_to_index: Optional existing token -> index mapping.
        """
        self.char_level = char_level
        self.separator = "" if self.char_level else " "
        if num_tokens:
            num_tokens -= 2  # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        """Add the most frequent tokens of ``texts`` to the vocabulary.

        Sets ``min_token_freq`` to the count of the least frequent token that
        was considered (0 when there were no tokens at all).

        Returns:
            ``self``, so calls can be chained.
        """
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        # An empty corpus (or a cap of zero) used to raise IndexError here
        self.min_token_freq = counts[-1][1] if counts else 0
        for token, _ in counts:
            # Re-assigning a known token would reuse ``len(self)`` without
            # growing the mapping, so the next new token would collide.
            if token in self.token_to_index:
                continue
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        """Map each text to a NumPy array of token indices (oov index if unknown)."""
        oov_index = self.token_to_index[self.oov_token]
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequences.append(np.asarray(
                [self.token_to_index.get(token, oov_index) for token in text]))
        return sequences

    def sequences_to_texts(self, sequences):
        """Map each index sequence back to text (oov token for unknown indices)."""
        return [
            self.separator.join(
                self.index_to_token.get(index, self.oov_token)
                for index in sequence)
            for sequence in sequences
        ]

    def save(self, fp):
        """Write the tokenizer configuration and vocabulary to JSON at ``fp``."""
        contents = {
            "char_level": self.char_level,
            # Persisted so a custom pad token survives a save/load round trip
            "pad_token": self.pad_token,
            "oov_token": self.oov_token,
            "token_to_index": self.token_to_index,
        }
        with open(fp, "w") as f:
            json.dump(contents, f, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build a tokenizer from a JSON file previously written by ``save``."""
        with open(fp, "r") as f:
            kwargs = json.load(fp=f)
        return cls(**kwargs)
# Tokenize
# Word-level tokenizer; the 5000 cap includes the <PAD> and <UNK> entries
tokenizer = Tokenizer(char_level=False, num_tokens=5000)
tokenizer.fit_on_texts(texts=X_train) # vocabulary is built from the training split only
VOCAB_SIZE = len(tokenizer)
print (tokenizer)
# Sample of tokens
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") # use this to adjust num_tokens
# Convert texts to sequences of indices
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
# Round-trip the first training sample to show text next to its indices
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print ("Text to indices:\n"
f" (preprocessed) → {preprocessed_text}\n"
f" (tokenized) → {X_train[0]}")
填充
我们需要对我们的标记化文本进行 2D 填充。
def pad_sequences(sequences, max_seq_len=0):
    """Right-pad variable-length sequences with zeros into one 2D array.

    Args:
        sequences: Sequence of 1D index sequences (may be empty).
        max_seq_len: Minimum width of the result; the longest sequence wins
            when it is longer.

    Returns:
        Float NumPy array of shape ``(len(sequences), width)`` where
        ``width = max(max_seq_len, longest sequence)``.
    """
    # ``default=0`` keeps an empty batch from raising ValueError in max()
    longest = max((len(sequence) for sequence in sequences), default=0)
    max_seq_len = max(max_seq_len, longest)
    padded_sequences = np.zeros((len(sequences), max_seq_len))
    for i, sequence in enumerate(sequences):
        padded_sequences[i, :len(sequence)] = sequence
    return padded_sequences
# 2D sequences
# Pad the first three tokenized training samples to a common length
padded = pad_sequences(X_train[0:3])
print (padded.shape)
print (padded)
数据集
我们将创建数据集和数据加载器,以便能够使用我们的数据拆分有效地创建批次。
class Dataset(torch.utils.data.Dataset):
    """Dataset of (token index sequence, label) pairs with batch padding."""

    def __init__(self, X, y):
        """
        Args:
            X: Indexable collection of token index sequences.
            y: Indexable collection of labels aligned with ``X``.
        """
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        """Return ``[sequence, sequence length, label]`` for one sample."""
        X = self.X[index]
        y = self.y[index]
        return [X, len(X), y]

    def collate_fn(self, batch):
        """Turn a list of ``__getitem__`` rows into padded batch tensors.

        Returns:
            ``(X, seq_lens, y)`` as ``torch.LongTensor`` objects; ``X`` is
            zero-padded to the longest sequence in the batch.
        """
        # Unpack column-wise. ``np.array(batch)`` on rows holding sequences of
        # different lengths raises ValueError on recent NumPy releases.
        sequences, seq_lens, labels = zip(*batch)

        # Pad inputs
        X = pad_sequences(sequences=sequences)

        # Cast
        X = torch.LongTensor(X.astype(np.int64))
        seq_lens = torch.LongTensor(np.asarray(seq_lens, dtype=np.int64))
        y = torch.LongTensor(np.asarray(labels, dtype=np.int64))
        return X, seq_lens, y

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        """Wrap this dataset in a ``DataLoader`` that uses ``collate_fn``."""
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True)
# Create datasets
# One Dataset per split, pairing tokenized inputs with encoded labels
train_dataset = Dataset(X=X_train, y=y_train)
val_dataset = Dataset(X=X_val, y=y_val)
test_dataset = Dataset(X=X_test, y=y_test)
# Summarize each dataset and show the first training item (sequence, length, label)
print ("Datasets:\n"
f" Train dataset:{train_dataset.__str__()}\n"
f" Val dataset: {val_dataset.__str__()}\n"
f" Test dataset: {test_dataset.__str__()}\n"
"Sample point:\n"
f" X: {train_dataset[0][0]}\n"
f" seq_len: {train_dataset[0][1]}\n"
f" y: {train_dataset[0][2]}")
# Create dataloaders
# NOTE(review): shuffle is left at its default (False) for all three loaders,
# including training — confirm that is intended
batch_size = 64
train_dataloader = train_dataset.create_dataloader(
batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(
batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(
batch_size=batch_size)
# Pull one batch to inspect the tensor sizes produced by collate_fn
batch_X, batch_seq_lens, batch_y = next(iter(train_dataloader))
print ("Sample batch:\n"
f" X: {list(batch_X.size())}\n"
f" seq_lens: {list(batch_seq_lens.size())}\n"
f" y: {list(batch_y.size())}\n"
"Sample point:\n"
f" X: {batch_X[0]}\n"
f" seq_len: {batch_seq_lens[0]}\n"
f" y: {batch_y[0]}")
培训
让我们创建一个Trainer
类,我们将使用它来促进我们的实验训练。
class Trainer(object):
    """Run training, evaluation and prediction loops for a model.

    Every batch is expected to be a sequence of tensors whose last element is
    the target; all preceding elements are passed to the model as one list.
    """

    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):
        """
        Args:
            model: Module called as ``model(inputs)``.
            device: Device every batch tensor is moved to.
            loss_fn: Loss callable ``loss_fn(outputs, targets)``; needed for
                ``train_step`` / ``eval_step``.
            optimizer: Optimizer stepped once per training batch.
            scheduler: Optional scheduler stepped with the validation loss
                once per epoch.
        """
        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Run one pass over ``dataloader``, updating weights.

        Returns:
            Running mean of the batch losses.
        """
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):
            # Step
            batch = [item.to(self.device) for item in batch]  # Set device
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
            J = self.loss_fn(z, targets)  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics (incremental mean)
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Run one validation or test pass without gradient tracking.

        Returns:
            ``(loss, y_trues, y_probs)``: running mean loss, targets stacked
            as an ``(N, 1)`` array and class probabilities stacked row-wise.
        """
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):
                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics (incremental mean)
                loss += (J - loss) / (i + 1)

                # Store outputs; softmax over the class dimension is stated
                # explicitly instead of relying on the deprecated implicit dim
                y_prob = F.softmax(z, dim=1).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Return class probabilities for every sample in ``dataloader``.

        The last element of each batch (the target slot) is ignored.
        """
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over batches
        with torch.inference_mode():
            for batch in dataloader:
                # Move the batch to the model's device, as train_step and
                # eval_step do; this step used to be missing here
                batch = [item.to(self.device) for item in batch]
                inputs = batch[:-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z, dim=1).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        """Train with early stopping on the validation loss.

        Args:
            num_epochs: Maximum number of epochs.
            patience: Epochs without improvement tolerated before stopping.
            train_dataloader: Batches used for weight updates.
            val_dataloader: Batches used for the early-stopping signal.

        Returns:
            A copy of the model taken at the epoch with the lowest validation
            loss (the live model if no epoch ever improved).
        """
        import copy  # local import: only needed for the best-model snapshot

        best_val_loss = np.inf
        # Bound up front so a first epoch that does not improve (e.g. a NaN
        # loss) cannot hit an unbound local below
        best_model = self.model
        _patience = patience

        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            if self.scheduler is not None:
                self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Snapshot the weights; a plain reference would keep changing
                # as training continues
                best_model = copy.deepcopy(self.model)
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_model
RNN
循环神经网络
RNN 的输入是连续的,如文本或时间序列。
BATCH_SIZE = 64
EMBEDDING_DIM = 100
# Input
sequence_size = 8 # words per input
# Random stand-in for a batch of embedded tokens: (batch, sequence, embedding)
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
# One random length per sample.
# NOTE(review): randint's `low` defaults to 0 and `high` is exclusive, so the
# lengths fall in [0, sequence_size - 1] — confirm that a length of 0 is acceptable
seq_lens = torch.randint(high=sequence_size, size=(BATCH_SIZE, ))
print (x.shape)
print (seq_lens.shape)
在第一个时间步,之前的隐藏状态 $h_{t-1}$ 可以是零向量(无条件)或经过初始化(有条件)。如果我们对 RNN 施加条件,第一个隐藏状态 $h_0$ 可以对应某个特定条件,或者我们可以在每个时间步将特定条件拼接到随机初始化的隐藏向量上。更多关于这方面的内容,请参阅后续的 RNN 笔记本。
RNN_HIDDEN_DIM = 128
DROPOUT_P = 0.1
# Initialize hidden state
# All-zero starting state with one row per sample in the batch
hidden_t = torch.zeros((BATCH_SIZE, RNN_HIDDEN_DIM))
print (hidden_t.size())
我们将展示如何使用 PyTorchRNNCell和更抽象的RNN.
# Build a single-step recurrent cell: embedding-sized input, hidden-sized state
rnn_cell = nn.RNNCell(EMBEDDING_DIM, RNN_HIDDEN_DIM)
print (rnn_cell)

# Put the time axis first so each step can be indexed as x[t]
x = x.permute(1, 0, 2) # RNN needs batch_size to be at dim 1

# Feed the cell one time step at a time, carrying the hidden state forward
hiddens = []
for t in range(sequence_size):
    hidden_t = rnn_cell(x[t], hidden_t)
    hiddens.append(hidden_t)

# Stack the per-step states and move the batch axis back to dim 0
hiddens = torch.stack(hiddens).permute(1, 0, 2)
print (hiddens.size())
# We also could've used a more abstracted layer
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
# batch_first=True: the layer takes and returns tensors with the batch axis first
rnn = nn.RNN(EMBEDDING_DIM, RNN_HIDDEN_DIM, batch_first=True)
out, h_n = rnn(x) # h_n is the last hidden state
print ("out: ", out.shape)
print ("h_n: ", h_n.shape)
# The same tensors
print (out[:,-1,:]) # output at the final time step of every sample
print (h_n.squeeze(0)) # last hidden state; squeeze(0) drops dim 0 when it has size 1
在我们的模型中,我们希望在处理完句子中的最后一个相关标记后使用 RNN 的输出。最后一个相关标记不是指 `<PAD>` 标记,而是指句子中的最后一个实际单词,并且它的索引对于批次中的每个输入都是不同的。这就是我们在批次中包含 `seq_lens` 张量的原因。
def gather_last_relevant_hidden(hiddens, seq_lens):
    """Pick, for every sample, the hidden state at its last real time step.

    Args:
        hiddens: Tensor indexed as ``hiddens[batch_index, time_index]``.
        seq_lens: 1D integer tensor with the unpadded length of each sample.

    Returns:
        Tensor holding ``hiddens[i, seq_lens[i] - 1]`` for each ``i``,
        stacked along dim 0.
    """
    # Index of the last relevant step per sample, on the same device as
    # ``hiddens`` so the lookup needs no host round trip
    last_indices = (seq_lens.long().detach() - 1).to(hiddens.device)
    batch_indices = torch.arange(last_indices.size(0), device=hiddens.device)
    # One vectorized gather replaces the per-sample Python loop and also
    # handles an empty batch, where torch.stack([]) used to raise
    return hiddens[batch_indices, last_indices]
# Get the last relevant hidden state
# Notebook expression: shape of the states gathered from `out` using the demo `seq_lens`
gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens).squeeze(0).shape
有许多不同的方法可以使用 RNN。到目前为止,我们一次处理了一个时间步的输入,我们可以在每个时间步使用 RNN 的输出,也可以只使用最终输入时间步的 RNN 输出。让我们看看其他一些可能性。
模型
import torch.nn.functional as F
HIDDEN_DIM = 100
class RNN(nn.Module):
    """Text classifier: embedding -> RNN -> two fully connected layers."""

    def __init__(self, embedding_dim, vocab_size, rnn_hidden_dim,
                 hidden_dim, dropout_p, num_classes, padding_idx=0):
        """
        Args:
            embedding_dim: Size of each token embedding.
            vocab_size: Number of rows in the embedding table.
            rnn_hidden_dim: Size of the recurrent hidden state.
            hidden_dim: Width of the first fully connected layer.
            dropout_p: Dropout probability applied between the FC layers.
            num_classes: Number of output logits.
            padding_idx: Embedding index reserved for padding.
        """
        super().__init__()

        # Token embeddings
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

        # Recurrent layer (batch axis first)
        self.rnn = nn.RNN(embedding_dim, rnn_hidden_dim, batch_first=True)

        # Classification head
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(rnn_hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        """Return class logits for ``inputs = (token_indices, seq_lens)``."""
        token_indices, seq_lens = inputs
        embedded = self.embeddings(token_indices)

        # Keep only the state at each sample's last real token
        hidden_states, _ = self.rnn(embedded)
        last_hidden = gather_last_relevant_hidden(
            hiddens=hidden_states, seq_lens=seq_lens)

        # fc1 -> dropout -> fc2
        return self.fc2(self.dropout(self.fc1(last_hidden)))
# Simple RNN cell
# Instantiate the RNN classifier with the hyperparameters defined above
model = RNN(
embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
rnn_hidden_dim=RNN_HIDDEN_DIM, hidden_dim=HIDDEN_DIM,
dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
model = model.to(device) # set device
# Prints the bound method's repr (the method is referenced, not called)
print (model.named_parameters)
训练
from torch.optim import Adam
# Training hyperparameters
NUM_LAYERS = 1 # NOTE(review): not referenced elsewhere in the visible code
LEARNING_RATE = 1e-4
PATIENCE = 10
NUM_EPOCHS = 50
# Define Loss
# Per-class weights (ordered by class index) rescale the cross-entropy loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
# Multiply the learning rate by 0.1 once the monitored value stops decreasing for 3 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer, mode="min", factor=0.1, patience=3)
# Trainer module
trainer = Trainer(
model=model, device=device, loss_fn=loss_fn,
optimizer=optimizer, scheduler=scheduler)
# Train
best_model = trainer.train(
NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)
评估
import json
from sklearn.metrics import precision_recall_fscore_support
def get_metrics(y_true, y_pred, classes):
    """Compute overall (weighted) and per-class precision, recall and F1.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        classes: Class names; position ``i`` names class index ``i``.

    Returns:
        Dict with an ``"overall"`` entry and a ``"class"`` entry keyed by
        class name; each holds precision, recall, f1 and num_samples.
    """
    # Weighted averages across all classes
    overall = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance = {
        "overall": {
            "precision": overall[0],
            "recall": overall[1],
            "f1": overall[2],
            "num_samples": np.float64(len(y_true)),
        },
        "class": {},
    }

    # One row of scores per class
    per_class = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        class_name = classes[i]
        performance["class"][class_name] = {
            "precision": per_class[0][i],
            "recall": per_class[1][i],
            "f1": per_class[2][i],
            "num_samples": np.float64(per_class[3][i]),
        }
    return performance
# Get predictions (a single pass; the second, identical evaluation pass was redundant)
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)
# Determine performance, mirroring the GRU evaluation later in the lesson
performance = get_metrics(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))
门控RNN
虽然到目前为止我们的简单 RNN 非常适合按顺序处理我们的输入,但它们有不少缺点。由于在每个时间步对输入使用同一组权重($W_{xh}$ 和 $W_{hh}$),它们通常会遭受梯度爆炸或梯度消失的困扰:在反向传播期间,这可能导致梯度爆炸(>1)或消失(<1)。如果你一遍又一遍地将任何大于 1 的数字与自身相乘,它就会趋向无穷大(梯度爆炸);类似地,如果你一遍又一遍地将任何小于 1 的数字与自身相乘,它就会趋向零(梯度消失)。为了缓解这个问题,人们设计了门控 RNN 来选择性地保留信息。如果你有兴趣了解更多细节,这篇文章是必读的。
有两种流行的门控 RNN 类型:长短期记忆 (LSTM) 单元和门控循环单元 (GRU)。
在 LSTM 和 GRU 之间做选择时,实际测得的性能是最好的依据;但通常 GRU 能以更低的复杂度(更少的权重)提供相近的性能。
# Input
sequence_size = 8 # words per input
# Random stand-in for a batch of embedded tokens: (batch, sequence, embedding)
x = torch.rand((BATCH_SIZE, sequence_size, EMBEDDING_DIM))
print (x.shape)
# GRU
gru = nn.GRU(input_size=EMBEDDING_DIM, hidden_size=RNN_HIDDEN_DIM, batch_first=True)
# Forward pass
out, h_n = gru(x) # per-step outputs and the final hidden state
print (f"out: {out.shape}")
print (f"h_n: {h_n.shape}")
双向 RNN
我们还可以让 RNN 处理来自两个方向的输入(第一个令牌到最后一个令牌,反之亦然)并组合它们的输出。这种架构被称为双向 RNN。
# GRU
# Same layer as before, now reading the sequence in both directions
gru = nn.GRU(input_size=EMBEDDING_DIM, hidden_size=RNN_HIDDEN_DIM,
batch_first=True, bidirectional=True)
# Forward pass
out, h_n = gru(x)
print (f"out: {out.shape}")
print (f"h_n: {h_n.shape}")
请注意,每个样本在每个时间步的输出大小为 256(是 `RNN_HIDDEN_DIM` 的两倍)。这是因为它同时包含了 BiRNN 的前向和后向两个方向的输出。
模型
class GRU(nn.Module):
    """Text classifier: embedding -> bidirectional GRU -> two FC layers."""

    def __init__(self, embedding_dim, vocab_size, rnn_hidden_dim,
                 hidden_dim, dropout_p, num_classes, padding_idx=0):
        """
        Args:
            embedding_dim: Size of each token embedding.
            vocab_size: Number of rows in the embedding table.
            rnn_hidden_dim: Hidden size of the GRU per direction.
            hidden_dim: Width of the first fully connected layer.
            dropout_p: Dropout probability applied between the FC layers.
            num_classes: Number of output logits.
            padding_idx: Embedding index reserved for padding.
        """
        super().__init__()

        # Initialize embeddings
        self.embeddings = nn.Embedding(embedding_dim=embedding_dim,
                                       num_embeddings=vocab_size,
                                       padding_idx=padding_idx)

        # RNN
        self.rnn = nn.GRU(embedding_dim, rnn_hidden_dim,
                          batch_first=True, bidirectional=True)

        # FC weights; fc1 takes both directions concatenated, hence ``* 2``
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(rnn_hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    # The signature was missing its closing parenthesis (a syntax error)
    def forward(self, inputs):
        """Return class logits for ``inputs = (token_indices, seq_lens)``."""
        # Embed
        x_in, seq_lens = inputs
        x_in = self.embeddings(x_in)

        # Rnn outputs
        # NOTE(review): at the last relevant step the backward direction has
        # only processed the tokens from the end of the padded sequence up to
        # that step — confirm this is the intended summary
        out, h_n = self.rnn(x_in)
        z = gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens)

        # FC layers
        z = self.fc1(z)
        z = self.dropout(z)
        z = self.fc2(z)
        return z
# Bidirectional GRU model, built with the same hyperparameters as the RNN above
model = GRU(
embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
rnn_hidden_dim=RNN_HIDDEN_DIM, hidden_dim=HIDDEN_DIM,
dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
model = model.to(device) # set device
# Prints the bound method's repr (the method is referenced, not called)
print (model.named_parameters)
训练
# Define Loss
# Per-class weights (ordered by class index) rescale the cross-entropy loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
# Define optimizer & scheduler
# Fresh optimizer and scheduler bound to the new model's parameters
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer, mode="min", factor=0.1, patience=3)
# Trainer module
trainer = Trainer(
model=model, device=device, loss_fn=loss_fn,
optimizer=optimizer, scheduler=scheduler)
# Train
best_model = trainer.train(
NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)
评估
from pathlib import Path
# Predict on the test split and take the most probable class per sample
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)

# Score the predictions and print the overall numbers
performance = get_metrics(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))

# Persist every artifact under ./gru
# (`dir` shadows the builtin; the name is kept because later cells read it)
dir = Path("gru")
dir.mkdir(parents=True, exist_ok=True)
label_encoder.save(fp=dir / "label_encoder.json")
tokenizer.save(fp=dir / "tokenizer.json")
torch.save(best_model.state_dict(), dir / "model.pt")
with open(dir / "performance.json", "w") as fp:
    json.dump(performance, fp, indent=2, sort_keys=False)
推理
def get_probability_distribution(y_prob, classes):
    """Pair class names with probabilities, most probable first.

    Args:
        y_prob: Indexable of probabilities; position ``i`` belongs to
            ``classes[i]``.
        classes: Iterable of class names.

    Returns:
        Dict of class name -> ``np.float64`` probability, ordered by
        descending probability.
    """
    by_class = {
        class_: np.float64(y_prob[i]) for i, class_ in enumerate(classes)
    }
    ranked = sorted(by_class.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
# Load artifacts
# Inference runs on CPU; the saved weights are mapped to it when loaded
device = torch.device("cpu")
label_encoder = LabelEncoder.load(fp=Path(dir, "label_encoder.json"))
tokenizer = Tokenizer.load(fp=Path(dir, 'tokenizer.json'))
# Rebuild the architecture with the training hyperparameters, then restore the weights
model = GRU(
embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
rnn_hidden_dim=RNN_HIDDEN_DIM, hidden_dim=HIDDEN_DIM,
dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
model.load_state_dict(torch.load(Path(dir, "model.pt"), map_location=device))
model.to(device)
# Initialize trainer
# No loss, optimizer or scheduler: only predict_step is used below
trainer = Trainer(model=model, device=device)
# Dataloader
text = "The final tennis tournament starts next week."
X = tokenizer.texts_to_sequences([preprocess(text)])
print (tokenizer.sequences_to_texts(X))
# Placeholder labels (the first class repeated) so Dataset can be built;
# predict_step ignores the target slot of each batch
y_filler = label_encoder.encode([label_encoder.classes[0]]*len(X))
dataset = Dataset(X=X, y=y_filler)
dataloader = dataset.create_dataloader(batch_size=batch_size)
['决赛网球锦标赛下周开始']
# Inference
y_prob = trainer.predict_step(dataloader) # class probabilities, one row per input text
y_pred = np.argmax(y_prob, axis=1) # index of the most probable class per row
label_encoder.decode(y_pred) # notebook display of the predicted class names
['Sports']
# Class distributions
# Probabilities of the first (only) input, keyed by class name and sorted descending
prob_dist = get_probability_distribution(y_prob=y_prob[0], classes=label_encoder.classes)
print (json.dumps(prob_dist, indent=2))
{
  "Sports": 0.49753469228744507,
  "World": 0.2925860285758972,
  "Business": 0.1932886838912964,
  "Sci/Tech": 0.01659061387181282
}
要引用本课,请使用:
1 2 3 4 5 6 |
|
更多推荐
所有评论(0)