diff --git a/exercices/01-somme-vecteurs-gpu/correction/vec_add.cu b/exercices/01-somme-vecteurs-gpu/correction/vec_add.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fcc79e9da31a18d264b137a442c508f03c6a7ff
--- /dev/null
+++ b/exercices/01-somme-vecteurs-gpu/correction/vec_add.cu
@@ -0,0 +1,76 @@
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <algorithm>
+
+// Element-wise sum c[i] = a[i] + b[i] for 0 <= i < N, 64-bit grid-stride loop: valid for any 1D launch
+// (even fewer threads than N), and the index cannot wrap to a negative int that passes the bounds test.
+__global__ void vectorAdd(const int* a, const int* b, int* c, int N) {
+  const long long stride = (long long)gridDim.x * blockDim.x;  // total number of threads in the grid
+  for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
+    c[i] = a[i] + b[i];
+}
+
+// Stops the program with a readable message when a CUDA runtime call fails.
+static void checkCuda(cudaError_t status, const char* what) {
+  if (status == cudaSuccess) return;
+  std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
+  exit(1);
+}
+
+// Adds two random vectors of 2^28 ints on the GPU, then verifies the sum on the host.
+// Returns 0 on success; the process ends with status 1 on a CUDA error or a wrong element.
+int main() {
+  // Vector length (a power of two) and the size of one vector in bytes.
+  const int N = 1 << 28;
+  const size_t n_bytes = sizeof(int)*N;
+
+  std::cout << "Number of integers: " << N << std::endl;
+  std::cout << "Vector size: " << (double) n_bytes/1000000000.0 << " [GB]" << std::endl;
+
+  // Host vectors for C = A + B; std::vector throws std::bad_alloc if an allocation fails.
+  std::vector<int> h_a(N), h_b(N), h_c(N);
+
+  // Random inputs in [0, 99].
+  for (int i = 0; i < N; i++) {
+    h_a[i] = rand() % 100;
+    h_b[i] = rand() % 100;
+  }
+
+  // Device buffers.
+  int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
+  checkCuda(cudaMalloc(&d_a, n_bytes), "cudaMalloc d_a");
+  checkCuda(cudaMalloc(&d_b, n_bytes), "cudaMalloc d_b");
+  checkCuda(cudaMalloc(&d_c, n_bytes), "cudaMalloc d_c");
+
+  // Host -> Device
+  checkCuda(cudaMemcpy(d_a, h_a.data(), n_bytes, cudaMemcpyHostToDevice), "copy A to device");
+  checkCuda(cudaMemcpy(d_b, h_b.data(), n_bytes, cudaMemcpyHostToDevice), "copy B to device");
+
+  // 1024 threads per block, and enough blocks (rounded up) for one thread per element.
+  const int NUM_THREADS = 1024;
+  const int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
+
+  // A kernel launch is asynchronous and returns no status, so query the launch error.
+  vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, N);
+  checkCuda(cudaGetLastError(), "vectorAdd launch");
+
+  // Device -> Host: this blocking copy also surfaces errors raised while the kernel ran.
+  checkCuda(cudaMemcpy(h_c.data(), d_c, n_bytes, cudaMemcpyDeviceToHost), "copy C to host");
+
+  // Release device memory; the host vectors clean up after themselves.
+  checkCuda(cudaFree(d_a), "cudaFree d_a");
+  checkCuda(cudaFree(d_b), "cudaFree d_b");
+  checkCuda(cudaFree(d_c), "cudaFree d_c");
+
+  // Explicit comparison rather than assert(), which compiles to nothing under -DNDEBUG.
+  for (int i = 0; i < N; i++) {
+    if (h_c[i] != h_a[i] + h_b[i]) {
+      std::cerr << "Mismatch at index " << i << std::endl;
+      return 1;
+    }
+  }
+
+  std::cout << "Addition de vecteurs: OK" << std::endl;
+  return 0;
+}