From c34cb4ae870f6e80bbe62ae64723c67dd9fdac62 Mon Sep 17 00:00:00 2001
From: LilDotTheGod <luciennoel2001@gmail.com>
Date: Thu, 27 Jun 2024 00:50:29 +0200
Subject: [PATCH] Simple Transformer test with CUDA

---
 README.md         | 11 ++++++++++-
 data.py           |  9 ++++++---
 main.py           | 24 +++++++++++++-----------
 musicgenerator.py |  4 ++--
 4 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index e04937d..0a69da6 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,10 @@
-Inspired by the [following](https://github.com/pytorch/examples/tree/main/word_language_model "PyTorch example of using a Transformer") project
\ No newline at end of file
+Inspired by the [following](https://github.com/pytorch/examples/tree/main/word_language_model "PyTorch example of using a Transformer") project
+
+# Running the project on a machine with CUDA
+If torch.cuda.is_available() returns False even though CUDA is installed on the machine, run the following command with the correct CUDA version number.
+
+This example installs the build for CUDA 11.8:
+
+```shell
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+```
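+
+To confirm that the GPU is actually picked up after reinstalling, a quick check (a minimal sketch; it mirrors the device selection already done in `main.py`):
+
+```python
+import torch
+
+# Should print True and "cuda" once the CUDA build of PyTorch is installed
+print(torch.cuda.is_available())
+print("cuda" if torch.cuda.is_available() else "cpu")
+```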
diff --git a/data.py b/data.py
index cc80c57..62af798 100644
--- a/data.py
+++ b/data.py
@@ -1,4 +1,5 @@
 import os
+import re
 from io import open
 import torch
 
@@ -21,7 +22,7 @@ class Dictionary(object):
 class Corpus(object):
     def __init__(self, path: str) -> None:
         self.dictionary = Dictionary()
-        self.train = self.tokenize(os.path.join(path, './training_data/classical_music/test/input.txt'))
+        self.train = self.tokenize(os.path.join(path, './training_data/classical_music/test/train.txt'))
         # self.valid = self.tokenize(os.path.join(path, './training_data/classical_music/test/input.txt'))
         # self.test = self.tokenize(os.path.join(path, './training_data/classical_music/test/input.txt'))
         self.valid = self.train
@@ -33,7 +34,8 @@ class Corpus(object):
         # Add words to the dictionary
         with open(path, 'r', encoding="utf8") as f:
             for line in f:
-                words = line.split()  # + ['<eos>']
+                words = re.findall(r'\b(?![^<]*>)[a-zA-Z0-9]+\b', line)  # only alphanumeric tokens outside <...> tags go into the dictionary
+                # words = line.split()  # + ['<eos>']
                 for word in words:
                     self.dictionary.add_word(word)
 
@@ -44,7 +46,8 @@ class Corpus(object):
                 words = line.split()  # + ['<eos>']
                 ids = []
                 for word in words:
-                    ids.append(self.dictionary.word2idx[word])
+                    if word in self.dictionary.word2idx:  # skip tokens that were not added to the dictionary (e.g. <...> tag contents)
+                        ids.append(self.dictionary.word2idx[word])
                 idss.append(torch.tensor(ids).type(torch.int64))
             ids = torch.cat(idss)
 
diff --git a/main.py b/main.py
index 02ee152..faefb6c 100644
--- a/main.py
+++ b/main.py
@@ -8,38 +8,40 @@ import musicgenerator as mg
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 if __name__ == '__main__':
-    learning_rate = 0.01
-    print("device :",device)
+    learning_rate = 20
+    print("device :", device)
 
     ###############################################################################
     # Prepare the data
     ###############################################################################
+    batch_size = 16
     corpus = data.Corpus('./')
-    train_data = data.batchify(corpus.train, 16, device)
-    val_data = data.batchify(corpus.valid, 16, device)
+    train_data = data.batchify(corpus.train, batch_size, device)
+    val_data = data.batchify(corpus.valid, batch_size, device)
 
     ###############################################################################
     # Build the model
     ###############################################################################
     ntokens = len(corpus.dictionary)
-    model = mg.MusicGenerator(vocab_size=ntokens, dim_model=8, num_head=4).to(device)
+    model = mg.MusicGenerator(vocab_size=ntokens, dim_model=512, num_head=8).to(device)
     criterion = nn.NLLLoss()
 
     ###############################################################################
     # Train the model
     ###############################################################################
-    sequence_length = 4
-    nb_log_epoch = 10
+    sequence_length = 32
+    nb_log_epoch = 5
+    epochs = 100
     mg.train(
         model=model,
         criterion=criterion,
         ntokens=ntokens,
         train_data=train_data,
         val_data=val_data,
-        sequence_length=4,
+        sequence_length=sequence_length,
         lr=learning_rate,
-        epochs=5,
-        log_interval=(len(train_data)//sequence_length)//nb_log_epoch
+        epochs=epochs,
+        log_interval=(len(train_data) // sequence_length) // nb_log_epoch
     )
 
     ###############################################################################
@@ -58,4 +60,4 @@ if __name__ == '__main__':
             input = torch.cat([input, word_tensor], 0)
 
             word = corpus.dictionary.idx2word[word_idx]
-            print(word + ('\n' if i % 20 == 19 else ' '))
+            print(word, end=' ')
diff --git a/musicgenerator.py b/musicgenerator.py
index d4f24a3..948651a 100644
--- a/musicgenerator.py
+++ b/musicgenerator.py
@@ -113,8 +113,8 @@ def train(model: MusicGenerator,
             print('-' * 89)
             # Save the model if the validation loss is the best we've seen so far.
             if not best_val_loss or val_loss < best_val_loss:
-                # with open(args.save, 'wb') as f:
-                #     torch.save(model, f)
+                with open('./model.pt', 'wb') as f:
+                    torch.save(model, f)
                 best_val_loss = val_loss
             else:
                 # Anneal the learning rate if no improvement has been seen in the validation dataset.
-- 
GitLab