# Classification de noms en utilisant un RNN

Dans ce tutorie nous introduisont les réseaux de neurones récurrents (RNNs).

Pour cet fin, nous travaillerons sur la classification de noms qui peuvent être soient une compagnie ou une personne. 

In [1]:
import math
import torch
import random
import numpy as np
from torch import optim, nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, random_split

from pytoune.framework import Model, ModelCheckpoint, Callback, CSVLogger, EarlyStopping, ReduceLROnPlateau
from pytoune import torch_to_numpy
from pytoune.layers import Flatten
from tensorboardX import SummaryWriter
from torchvision.utils import make_grid

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [3]:
cuda_device = 0
device = torch.device("cuda:%d" % cuda_device if torch.cuda.is_available() else "cpu")
batch_size = 32
learning_rate = 0.01
n_epoch = 30
num_classes = 2

While this dataset is really small and simple, the main purpose is to learn how do we handle textual/sequential data using a neural network.

In [5]:
with open('names_manual_annotation.tsv') as f:
    data = f.readlines()
    data = [d[:-1].split('\t') for d in data]
data = [(x, y) for x, y in data if y != 'Unknown']

In [6]:
data

[('Québec (Ville de)', 'Company'),
 ('Drouin', 'Person'),
 ('Pneus Métro inc.', 'Company'),
 ('Graton', 'Person'),
 ('English', 'Person'),
 ('Clinique vétérinaire Ève Woods-Lavoie inc.', 'Company'),
 ('Grimard', 'Person'),
 ('St-Amand', 'Person'),
 ('Douville', 'Person'),
 ('Fréchette', 'Person'),
 ('Desroches', 'Person'),
 ('Gestion des résidents du chemin de la Grève Est et des Domaines nos étés et Kamouraska',
  'Company'),
 ('Gilbert', 'Person'),
 ('Le Bouhris', 'Person'),
 ('Castonguay', 'Person'),
 ('Alcide Ouellet et Fils inc.', 'Company'),
 ('9308-5934 Québec inc. (Volvo Trois-Rivières)', 'Company'),
 ('Hassan', 'Person'),
 ('Magasins Patrick Morin', 'Company'),
 ('Lessard (Construction Éric Lessard enr.)', 'Company'),
 ('Vacances Air-Transat', 'Company'),
 ('Gilbert', 'Person'),
 ('9239-2497 Québec inc. (Toiture Vision)', 'Company'),
 ('Syndicat de la copropriété du 1324, 1326 et 1328, rue Saint-Zotique',
  'Company'),
 ('Poirier', 'Person'),
 ('Royer', 'Person'),
 ('Dubuc', '

In [7]:
len(data)

793

We will be working with the characters of each names to classify if its a Person or a Company.

This is why we create a ```list``` with the name.

In [9]:
def format_examples(data):
    examples = list()
    for name, tag in data:
        examples.append((list(name), tag))
    return examples

In [10]:
formatted_data = format_examples(data)

In [11]:
formatted_data[0]

(['Q',
  'u',
  'é',
  'b',
  'e',
  'c',
  ' ',
  '(',
  'V',
  'i',
  'l',
  'l',
  'e',
  ' ',
  'd',
  'e',
  ')'],
 'Company')

In [12]:
formatted_data = random.sample(formatted_data, len(formatted_data))
train_ratio = int(len(formatted_data)*0.8) # 80% of dataset
train = formatted_data[:train_ratio]
test = formatted_data[train_ratio:]
valid_ratio = int(len(train)*0.8) # 20% of train set
valid = train[valid_ratio:]

In [13]:
len(train), len(valid), len(test)

(634, 127, 159)

Nous devons nous créer un vocabulaire pour toute donnée "non numérique". Ce vocabulaire nous sert d'index qui sera utilisé pour trouvé le vecteur de rééls associé à cet élément non numérique.

In [14]:
vocab = set()
tags = set()

for example in train:
    for char in example[0]:
        vocab.add(char)
    tags.add(example[1])
    
char_to_idx = {
    '<PAD>': 0,
    '<UNK>': 1,
}

for char in sorted(vocab):
    char_to_idx[char] = len(char_to_idx)
    
tag_to_idx = {tag: i for i, tag in enumerate(sorted(tags))}

In [16]:
char_to_idx

{' ': 2,
 '&': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '/': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 '<PAD>': 0,
 '<UNK>': 1,
 'A': 21,
 'B': 22,
 'C': 23,
 'D': 24,
 'E': 25,
 'F': 26,
 'G': 27,
 'H': 28,
 'I': 29,
 'J': 30,
 'K': 31,
 'L': 32,
 'M': 33,
 'N': 34,
 'O': 35,
 'P': 36,
 'Q': 37,
 'R': 38,
 'S': 39,
 'T': 40,
 'U': 41,
 'V': 42,
 'W': 43,
 'X': 44,
 'Y': 45,
 'Z': 46,
 'a': 47,
 'b': 48,
 'c': 49,
 'd': 50,
 'e': 51,
 'f': 52,
 'g': 53,
 'h': 54,
 'i': 55,
 'j': 56,
 'k': 57,
 'l': 58,
 'm': 59,
 'n': 60,
 'o': 61,
 'p': 62,
 'q': 63,
 'r': 64,
 's': 65,
 't': 66,
 'u': 67,
 'v': 68,
 'w': 69,
 'x': 70,
 'y': 71,
 'z': 72,
 'É': 73,
 'â': 74,
 'ç': 75,
 'è': 76,
 'é': 77,
 'ê': 78,
 'ë': 79,
 'î': 80,
 'ô': 81,
 '—': 82}

In [17]:
tag_to_idx

{'Company': 0, 'Person': 1}

Le Vectorizer va nous servir à convertir toute donnée 'non numérique' en donnée numérique.

In [18]:
class Vectorizer:
    def __init__(self, char_to_idx, tag_to_idx):
        self.char_to_idx = char_to_idx
        self.tag_to_idx = tag_to_idx

    def vectorize_sequence(self, sequence, idx, remove_if_unk=False):
        if '<UNK>' in idx:
            unknown_index = idx['<UNK>']
            chars = [idx.get(tok, unknown_index) for tok in sequence]
            if remove_if_unk:
                return [w for w in chars if w != unknown_index]
            else:
                return chars

        else:
            return [idx[tok] for tok in sequence]

    def __call__(self, example):
        name, tag = example
        vectorized_name = self.vectorize_sequence(name, self.char_to_idx)
        vectorized_tag = self.tag_to_idx[tag]
        return (
            vectorized_name,
            vectorized_tag,
        )

vectorizer = Vectorizer(char_to_idx, tag_to_idx)

In [19]:
train_data = [vectorizer(example) for example in train]
valid_data = [vectorizer(example) for example in valid]
test_data = [vectorizer(example) for example in test]

In [20]:
train_data[0]

([32, 47, 62, 61, 55, 60, 66, 51], 1)

In [21]:
train[0]

(['L', 'a', 'p', 'o', 'i', 'n', 't', 'e'], 'Person')

Le concept de padding est extrêmement important. Il nous permet d'envoyer des tenseurs de longueurs différentes sur le GPU.

Nous prenons donc le tenseur le plus long de notre minibatch pour créer une matrice d'exemple.

In [22]:
import torch

def pad_sequences(vectorized_seqs, seq_lengths):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq[:seqlen])
    return seq_tensor

def collate_examples(samples):
    names, tags = list(zip(*samples))
    names_lengths = torch.LongTensor([len(s) for s in names])
    padded_names = pad_sequences(names, names_lengths)
    tags = torch.LongTensor(tags)
    return padded_names, tags

In [23]:
from torch.utils.data import DataLoader, Dataset

batch_size = 16

train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    collate_fn=collate_examples,
    shuffle=True
)

valid_loader = DataLoader(
    valid_data,
    batch_size=batch_size,
    collate_fn=collate_examples,
    shuffle=False
)

test_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    collate_fn=collate_examples,
    shuffle=False
)

In [24]:
b = next(iter(train_loader))
b[0].shape, b

(torch.Size([16, 48]),
 (tensor([[39, 55, 60, 53, 58, 51, 66, 61, 60,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
          [38, 55, 61, 67, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
          [38, 51, 65, 66, 47, 67, 64, 47, 60, 66,  8, 22, 47, 64,  2, 32, 47,  2,
           38, 61, 49, 54, 51, 58, 55, 76, 64, 51,  2, 55, 60, 49,  9,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
          [25, 70, 49, 47, 68, 47, 66, 55, 61, 60, 65,  2, 27,  9,  2, 21, 58, 58,
           47, 64, 50,  2, 55, 60, 49,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
          [20, 13, 13, 11,  8, 17, 17, 15, 19,  2, 37, 67, 77, 48, 51, 49

In [25]:
from torch import nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

class NameClassifier(nn.Module):
    def __init__(self, char_to_idx, tag_to_idx, embedding_size, hidden_layer_size):
        super(NameClassifier, self).__init__()
        self.embeddings = nn.Embedding(len(char_to_idx), embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_layer_size)
        self.fully_connected = nn.Linear(hidden_layer_size, len(tag_to_idx))
        self.loss_function = nn.CrossEntropyLoss()
        self.metrics = ['acc']

    def forward(self, names):
        # Getting the length of sequences so we can pack them and send them to gpu.
        # We also sort the sequences by length, as required by the pack_padded_sequence function
        seq_lengths, perm_idx = (names > 0).sum(dim=1).sort(0, descending=True)
        
        # We need the reverse idx to unsort the sequence at the end of the forward
        _, rev_perm_idx = perm_idx.sort(0)
        
        # (batch_size, max_length)
        sorted_names = names[perm_idx]
        
        # (batch_size, max_length, embedding_size)
        embeds = self.embeddings(sorted_names)
        
        packed_names = pack_padded_sequence(embeds, seq_lengths, batch_first=True)
        
        # (1, batch_size, hidden_layer_size)
        _, (h_n, _) = self.rnn(packed_names)
        
        # (1, batch_size, num_tags)
        out = self.fully_connected(h_n)
        
        # (batch_size, num_tags)
        out = out.squeeze(0)
        
        return out[rev_perm_idx]

In [26]:
loaders = [train_loader, valid_loader, test_loader]

In [27]:
def train(name, pytorch_module):
    optimizer = optim.SGD(pytorch_module.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
    loss_function = nn.CrossEntropyLoss()

    # Pytoune Model
    model = Model(pytorch_module, optimizer, loss_function, metrics=['accuracy'])

    # Send model on GPU
    model.to(device)

    # Train
    model.fit_generator(train_loader, valid_loader, epochs=n_epoch)
    
    return model

In [28]:
net = NameClassifier(char_to_idx, tag_to_idx, embedding_size=50, hidden_layer_size=100)
model = train('name_classifier', net)

Epoch 1/30[26] > [33;01m<ipython-input-25-1b1f6b3d2c82>[00m([36;01m18[00m)forward()
-> seq_lengths, perm_idx = (names > 0).sum(dim=1).sort(0, descending=True)
(Pdb++) ll
[36;01m  14[00m         [34;01mdef[39;49;00m [32;01mforward[39;49;00m([36;01mself[39;49;00m, names):                                          
[36;01m  15[00m             [34;01mimport[39;49;00m [04m[36;01mpdb[39;49;00m; pdb.set_trace()                                    
[36;01m  16[00m             [30;01m# Getting the length of sequences so we can pack them and send [39;49;00m
[36;01m  17[00m             [30;01m# We also sort the sequences by length, as required by the pack[39;49;00m
[44m[36;01;44m  18[00;44m  ->         seq_lengths, perm_idx = (names > [34;01;44m0[39;49;00;44m).sum(dim=[34;01;44m1[39;49;00;44m).sort([34;01;44m0[39;49;00;44m, descendi[00m
[36;01m  19[00m                                                                            
[36;01m  20[00m             [3

(Pdb++) sorted_names
tensor([[39, 71, 60, 50, 55, 49, 47, 66,  2, 50, 51, 65,  2, 49, 61, 62, 64, 61,
         62, 64, 55, 77, 66, 47, 55, 64, 51, 65,  2, 50, 67,  2, 12, 17, 12, 11],
        [23, 51, 60, 66, 64, 51,  2, 32,  4, 21, 67, 66, 64, 51,  2, 49, 81, 66,
         77,  2, 50, 51,  2, 58,  4, 61, 59, 48, 64, 51,  2, 55, 60, 49,  9,  0],
        [30, 61, 58, 55,  8, 23, 61, 51, 67, 64,  2, 32, 47, 49, 47, 65, 65, 51,
          7,  2, 65,  9, 51,  9, 60,  9, 49,  9, 64,  9, 58,  9,  0,  0,  0,  0],
        [38, 77, 60, 61, 68, 47, 66, 55, 61, 60, 65,  2, 27, 51, 61, 64, 53, 51,
         65,  2, 36, 47, 64, 55, 65,  2, 55, 60, 49,  9,  0,  0,  0,  0,  0,  0],
        [36, 58, 47, 49, 51, 59, 51, 60, 66, 65,  2, 33,  9, 30,  9,  2, 36, 61,
         55, 64, 55, 51, 64,  2, 55, 60, 49,  9,  0,  0,  0,  0,  0,  0,  0,  0],
        [23, 61, 60, 65, 66, 64, 67, 49, 66, 55, 61, 60,  2, 33, 55, 50, 47, 58,
         66, 61,  2, 55, 60, 49,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
 

(Pdb++) n
[26] > [33;01m<ipython-input-25-1b1f6b3d2c82>[00m([36;01m32[00m)forward()
-> _, (h_n, _) = self.rnn(packed_names)
(Pdb++) packed_names
PackedSequence(data=tensor([[ 0.1912, -0.0095, -0.9079,  ..., -0.6360, -0.2510,  0.7005],
        [ 0.2240, -0.6876, -0.0302,  ...,  1.1689,  0.9026, -1.7167],
        [-0.3566, -1.9664,  0.3476,  ...,  1.5742,  0.1354, -0.7954],
        ...,
        [ 2.8104,  0.9233, -0.4852,  ...,  1.0837,  1.9056,  1.6714],
        [-0.8018,  2.2057, -1.7395,  ..., -0.9025, -0.2357, -2.3813],
        [ 0.6346,  0.8769,  0.7514,  ...,  0.1126, -1.4679, -1.7168]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([16, 16, 16, 16, 16, 16, 13, 12, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,
         9,  8,  7,  6,  6,  6,  6,  5,  5,  5,  4,  4,  3,  3,  2,  2,  2,  1]))
(Pdb++) l
 [34;01m33[39;49;00m  	
 [34;01m34[39;49;00m  	        [30;01m# (1, batch_size, num_tags)[39;49;00m
 [34;01m35[39;49;00m  	        out = [36;01mself[39;49;00

(Pdb++) q


BdbQuit: 

In [23]:
model.evaluate_generator(test_loader)

(0.11388833978383234, 96.85534591194968)