Porting PyTorch code from CPU to GPU

I am following the tutorial at https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb

There is a USE_CUDA flag that controls whether the variables and tensors use CPU types (when False) or GPU types (when True).

Using the data from en-fr.tsv and converting the sentences to variables:

import unicodedata
import string
import re
import random
import time
import math

from gensim.corpora.dictionary import Dictionary

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F

import numpy as np

# Default value of the `max_length` argument of the Attn module.
MAX_LENGTH = 10
# Global device switch: when True, the modules and Variables built in this
# file are moved to the GPU with .cuda(); when False they stay on the CPU.
USE_CUDA = False

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return `s` with its combining marks (accents) removed.

    The string is first decomposed to NFD, which splits an accented letter
    into its base character followed by combining marks; every character
    whose Unicode category is 'Mn' is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Lowercase and strip `s`, remove accents, and isolate punctuation.

    Each of '.', '!' and '?' gets a space inserted in front of it, and every
    run of characters other than ASCII letters and those three marks is
    replaced by a single space.
    """
    cleaned = unicode_to_ascii(s.lower().strip())
    # Put a space before sentence punctuation so it splits off as a token
    spaced = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse anything that is not a letter or kept punctuation
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

# Special tokens and the indices intended for them.
# NOTE(review): the sentence conversion in this file looks ids up through
# `token2id` rather than these *_IDX constants; whether gensim assigns
# exactly these ids is not verified here -- confirm before relying on them.
SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'

# Read the tab-separated parallel corpus. The `with` block closes the file
# handle, and the explicit encoding keeps accented text from depending on
# the platform's default locale.
with open('en-fr.tsv', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

# One entry per line: a list of tokenized sentences, one per tab-separated column.
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)

# Each vocabulary is seeded with the special tokens before the corpus tokens.
src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)

trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)

def variablize_sentences(sentence, dictionary):
    """Convert a tokenized sentence into a column of vocabulary indices.

    Args:
        sentence: iterable of token strings.
        dictionary: object exposing a `token2id` mapping that contains
            EOS_TOKEN (and UNK_TOKEN, if unknown tokens may occur).

    Returns:
        A LongTensor Variable of shape (len(sentence) + 1, 1): one index per
        token followed by the EOS index, moved to the GPU when USE_CUDA.

    Raises:
        KeyError: if EOS_TOKEN is missing, or a token is unknown and
            UNK_TOKEN is missing as well.
    """
    token2id = dictionary.token2id
    # Tokens that are not in the vocabulary map to <unk> instead of
    # aborting the conversion with a KeyError.
    indices = [
        token2id[tok] if tok in token2id else token2id[UNK_TOKEN]
        for tok in sentence
    ]
    indices.append(token2id[EOS_TOKEN])
    var = Variable(LongTensor(indices).view(-1, 1))
    return var.cuda() if USE_CUDA else var

# Build the index Variables for both sides of the corpus: the source
# sentences with the source vocabulary first, then the target sentences
# with the target vocabulary.
input_variables, output_variables = (
    [variablize_sentences(tokens, vocab) for tokens in sentences]
    for sentences, vocab in ((src_sents, src_dict), (trg_sents, trg_dict))
)

And using an Encoder-Attn-Decoder network:

class EncoderRNN(nn.Module):
    """Embedding + GRU encoder that processes one sentence at a time.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: embedding width and GRU hidden size.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

        # Place the layers on the GPU when the global flag asks for it.
        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
        self.gru = self.gru.cuda() if USE_CUDA else self.gru

    def forward(self, word_inputs, hidden):
        """Encode a sequence of word indices.

        Args:
            word_inputs: Variable of word indices; its length is used as
                the sequence length.
            hidden: initial GRU hidden state, e.g. from `init_hidden()`.

        Returns:
            (output, hidden): the GRU outputs for every step and the final
            hidden state.
        """
        seq_len = len(word_inputs)

        # Reshape to (seq_len, 1, embedding width) before feeding the GRU.
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        embedded = embedded.cuda() if USE_CUDA else embedded

        output, hidden = self.gru(embedded, hidden)
        output = output.cuda() if USE_CUDA else output
        # Rebind `hidden` itself so the moved state is the one returned.
        hidden = hidden.cuda() if USE_CUDA else hidden

        return output, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (n_layers, 1, hidden_size)."""
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden

class Attn(nn.Module):
    """Attention scorer that weights encoder outputs against a decoder state.

    Supported `method` values:
        'dot'     -- dot product of the hidden state and the encoder output.
        'general' -- dot product of the hidden state and a linear projection
                     of the encoder output.
        'concat'  -- linear layer over the concatenated hidden state and
                     encoder output, then a dot product with a learned
                     vector (`self.other`).

    Args:
        method: one of the method names above.
        hidden_size: size of the hidden state and encoder output vectors.
        max_length: accepted for interface compatibility; not used here.
    """

    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # FloatTensor(1, n) only allocates memory; fill it with small
            # uniform values so training never starts from garbage or NaN.
            bound = hidden_size ** -0.5
            self.other = nn.Parameter(
                FloatTensor(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over `encoder_outputs`.

        Args:
            hidden: current decoder state to score against.
            encoder_outputs: sequence of encoder outputs; its length is
                used as the sequence length.

        Returns:
            Softmax-normalized weights of shape (1, 1, seq_len).
        """
        seq_len = len(encoder_outputs)

        # Create variable to store attention energies, one per encoder step
        attn_energies = Variable(torch.zeros(seq_len))
        attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)

    def score(self, hidden, encoder_output):
        """Return the attention energy between `hidden` and one encoder output.

        Raises:
            ValueError: if `self.method` is not 'dot', 'general' or 'concat'.
        """
        if self.method == 'dot':
            energy = torch.dot(hidden.view(-1), encoder_output.view(-1))
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.dot(hidden.view(-1), energy.view(-1))
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            # The learned vector is registered as `self.other` in __init__.
            energy = torch.dot(self.other.view(-1), energy.view(-1))
        else:
            raise ValueError(
                "unsupported attention method: {!r}".format(self.method))
        return energy

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention, run one output step at a time.

    Args:
        attn_model: attention method handed to `Attn`, or 'none' to build
            the decoder without an attention module.
        hidden_size: embedding width and GRU hidden size.
        output_size: number of embedding rows and of output classes.
        n_layers: number of stacked GRU layers.
        dropout_p: dropout probability passed to the GRU.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()

        # Keep parameters for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # Define layers; the GRU input and the output layer both take the
        # concatenation of two hidden_size vectors.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)

        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
        self.gru = self.gru.cuda() if USE_CUDA else self.gru
        self.out = self.out.cuda() if USE_CUDA else self.out

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)
            self.attn = self.attn.cuda() if USE_CUDA else self.attn
        else:
            # Always define the attribute so forward() can report a clear
            # error instead of failing on a missing attribute.
            self.attn = None

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Run a single decoding step.

        Args:
            word_input: index of the current input word (last output word).
            last_context: context vector from the previous step.
            last_hidden: GRU hidden state from the previous step.
            encoder_outputs: all encoder outputs for the source sentence.

        Returns:
            (output, context, hidden, attn_weights): log-probabilities over
            the output vocabulary, the new context vector, the new GRU
            hidden state and the attention weights.

        Raises:
            ValueError: if the decoder was built with attn_model='none'.
        """
        # Note: we run this one step at a time
        if self.attn is None:
            raise ValueError(
                "AttnDecoderRNN.forward needs an attention model, "
                "but the decoder was built with attn_model='none'")

        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N

        # Combine embedded input word and last context, run through RNN
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N

        # Final output layer (next word prediction) using the RNN hidden state and context vector
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))

        if USE_CUDA:
            return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
        else:
            return output, context, hidden, attn_weights

And testing the network:

# Smoke test: push one three-token input through a tiny encoder/decoder pair.
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L

encoder_hidden = encoder_test.init_hidden()
token_ids = torch.LongTensor([1, 2, 3])
word_inputs = Variable(token_ids.cuda() if USE_CUDA else token_ids)
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
# NOTE(review): this Variable is created on the CPU regardless of USE_CUDA.
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
for result in (decoder_output, decoder_hidden, decoder_attn):
    print(result)

The code works fine on CPU,

[out]:

EncoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(20, 10, num_layers=2, dropout=0.1)
  (out): Linear (20 -> 10)
  (attn): Attn (
    (attn): Linear (10 -> 10)
  )
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
  -0.2325  0.0775  0.5415  0.4876 -0.5771 -0.0687  0.1832 -0.5285  0.2508

Columns 9 to 9 
  -0.1837

(1 ,.,.) = 

Columns 0 to 8 
  -0.1389 -0.2605 -0.0518  0.3405  0.0774  0.1815  0.0297 -0.1304 -0.1015

Columns 9 to 9 
   0.2602
[torch.FloatTensor of size 2x1x10]

Variable containing:
(0 ,.,.) = 
  0.3334  0.3291  0.3374
[torch.FloatTensor of size 1x1x3]

but when changing the flag to USE_CUDA=True, calling the decoder_test object throws a TypeError:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
     12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
     13 
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
     15 print(decoder_output)
     16 print(decoder_hidden)

~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    222         for hook in self._forward_pre_hooks.values():
    223             hook(self, input)
--> 224         result = self.forward(*input, **kwargs)
    225         for hook in self._forward_hooks.values():
    226             hook_result = hook(self, input, result)

<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
     32 
     33         # Combine embedded input word and last context, run through RNN
---> 34         rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
     35         rnn_output, hidden = self.gru(rnn_input, last_hidden)
     36 

~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
    895         @staticmethod
    896         def cat(iterable, dim=0):
--> 897             return Concat.apply(dim, *iterable)
    898 
    899         @staticmethod

~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
    315         ctx.dim = dim
    316         ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317         return torch.cat(inputs, dim)
    318 
    319     @staticmethod

TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
 * (sequence[torch.cuda.FloatTensor] seq)
 * (sequence[torch.cuda.FloatTensor] seq, int dim)
      didn't match because some of the arguments have invalid types: (tuple, int)

The question is: why do the types not match on CUDA when the same code works on CPU, and how can this be resolved?

Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?

回答1:

You can also try:

# Pick the first GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# `YourNetworkClass` is a placeholder for your own nn.Module subclass.
net = YourNetworkClass()
# Move the network's parameters and buffers to the chosen device.
net.to(device)

After that, you have to send the word_inputs, encoder_hidden and decoder_context to the GPU too:

# Every input fed to the network goes to the same device as the network.
word_inputs = word_inputs.to(device)
encoder_hidden = encoder_hidden.to(device)
decoder_context = decoder_context.to(device)

Look here: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#training-on-gpu

回答2:

Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?

Nope =(

(Source: https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604)

Specific to the example:

The input variables passed to the decoder_test object need to be CUDA variables (moved with .cuda()). More specifically:

encoder_hidden = encoder_test.init_hidden()
---> encoder_hidden = encoder_test.init_hidden().cuda()


decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
---> decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()

So the code to test the network should be:

encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L

encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
    # Guarded by the flag so the same script still runs on a CPU-only machine.
    encoder_hidden = encoder_hidden.cuda()
    word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
    word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
# The context is concatenated with the decoder's embedded input, so it has
# to live on the same device as the decoder.
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
if USE_CUDA:
    decoder_context = decoder_context.cuda()

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)

[out]:

Variable containing:
-2.1412 -2.4589 -2.4042 -2.1591 -2.5080 -2.0839 -2.5058 -2.3831 -2.4468 -2.0804
[torch.cuda.FloatTensor of size 1x10 (GPU 0)]

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
  -0.0264 -0.0689  0.1049  0.0760  0.1017 -0.4585 -0.1273  0.0449 -0.3271

Columns 9 to 9 
  -0.0104

(1 ,.,.) = 

Columns 0 to 8 
  -0.0308 -0.0690 -0.0258 -0.2759  0.1403 -0.0468 -0.0205  0.0126 -0.1729

Columns 9 to 9 
   0.0599
[torch.cuda.FloatTensor of size 2x1x10 (GPU 0)]

Variable containing:
(0 ,.,.) = 
  0.3328  0.3328  0.3344
[torch.cuda.FloatTensor of size 1x1x3 (GPU 0)]