Following the tutorial from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
There is a USE_CUDA
flag that is used to control the variable and tensor types between CPU (when False) to GPU (when True) types.
Using the data from en-fr.tsv and converting the sentences to variables:
import unicodedata
import string
import re
import random
import time
import math
from gensim.corpora.dictionary import Dictionary
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F
import numpy as np
MAX_LENGTH = 10
USE_CUDA = False
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
s = unicode_to_ascii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
return s
SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'
lines = open('en-fr.tsv').read().strip().split('\n')
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)
src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)
trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)
def variablize_sentences(sentence, dictionary):
indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]]
var = Variable(LongTensor(indices).view(-1, 1))
return var.cuda() if USE_CUDA else var
input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents]
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents]
And using a Encoder-Attn-Decoder network:
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
def forward(self, word_inputs, hidden):
seq_len = len(word_inputs)
embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
embedded = embedded.cuda() if USE_CUDA else embedded
output, hidden = self.gru(embedded, hidden)
output = output.cuda() if USE_CUDA else output
hiddne = hidden.cuda() if USE_CUDA else hidden
return output, hidden
def init_hidden(self):
hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
return hidden.cuda() if USE_CUDA else hidden
class Attn(nn.Module):
def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.other = nn.Parameter(FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
seq_len = len(encoder_outputs)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
# Calculate energies for each encoder output
for i in range(seq_len):
attn_energies[i] = self.score(hidden, encoder_outputs[i])
# Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy =torch.dot(hidden.view(-1), encoder_output.view(-1))
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = torch.dot(hidden.view(-1), energy.view(-1))
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = torch.dot(self.v.view(-1), energy.view(-1))
return energy
class AttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
# Keep parameters for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout_p = dropout_p
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
self.out = nn.Linear(hidden_size * 2, output_size)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
self.out = self.out.cuda() if USE_CUDA else self.out
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
self.attn = self.attn.cuda() if USE_CUDA else self.attn
def forward(self, word_input, last_context, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
# Combine embedded input word and last context, run through RNN
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
rnn_output, hidden = self.gru(rnn_input, last_hidden)
# Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
# Final output layer (next word prediction) using the RNN hidden state and context vector
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
if USE_CUDA:
return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
else:
return output, context, hidden, attn_weights
And testing the network:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
The code works fine on CPU,
[out]:
EncoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(20, 10, num_layers=2, dropout=0.1)
(out): Linear (20 -> 10)
(attn): Attn (
(attn): Linear (10 -> 10)
)
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.2325 0.0775 0.5415 0.4876 -0.5771 -0.0687 0.1832 -0.5285 0.2508
Columns 9 to 9
-0.1837
(1 ,.,.) =
Columns 0 to 8
-0.1389 -0.2605 -0.0518 0.3405 0.0774 0.1815 0.0297 -0.1304 -0.1015
Columns 9 to 9
0.2602
[torch.FloatTensor of size 2x1x10]
Variable containing:
(0 ,.,.) =
0.3334 0.3291 0.3374
[torch.FloatTensor of size 1x1x3]
but when changing the flag to USE_GPU=True
, it throws the error when initializing the decoder_test
object, it throws a TypeError
:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
13
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
15 print(decoder_output)
16 print(decoder_hidden)
~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
222 for hook in self._forward_pre_hooks.values():
223 hook(self, input)
--> 224 result = self.forward(*input, **kwargs)
225 for hook in self._forward_hooks.values():
226 hook_result = hook(self, input, result)
<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
32
33 # Combine embedded input word and last context, run through RNN
---> 34 rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
35 rnn_output, hidden = self.gru(rnn_input, last_hidden)
36
~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
895 @staticmethod
896 def cat(iterable, dim=0):
--> 897 return Concat.apply(dim, *iterable)
898
899 @staticmethod
~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
315 ctx.dim = dim
316 ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317 return torch.cat(inputs, dim)
318
319 @staticmethod
TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
* (sequence[torch.cuda.FloatTensor] seq)
* (sequence[torch.cuda.FloatTensor] seq, int dim)
didn't match because some of the arguments have invalid types: (tuple, int)
The question is why are that types not matching in CUDA but it works on CPU and how to resolve this?
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?