diff --git a/exercices/03-equation-chaleur/correction/Makefile b/exercices/03-equation-chaleur/correction/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..59210a56b289a03fcb55d2dc916fcc82fbbe7f0e
--- /dev/null
+++ b/exercices/03-equation-chaleur/correction/Makefile
@@ -0,0 +1,14 @@
+NVCC := nvcc
+CFLAGS := -O3
+SRC := heat_equation.cu
+EXEC := heat_equation
+
+all: $(EXEC)
+
+$(EXEC): $(SRC)
+	$(NVCC) $(CFLAGS) -o $(EXEC) $(SRC)
+
+clean:
+	rm -f $(EXEC)
+
+.PHONY: all clean
\ No newline at end of file
diff --git a/exercices/03-equation-chaleur/correction/heat_equation.cu b/exercices/03-equation-chaleur/correction/heat_equation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6afc953c3d148d11936dc855ba565af9d648d3c5
--- /dev/null
+++ b/exercices/03-equation-chaleur/correction/heat_equation.cu
@@ -0,0 +1,154 @@
+#include<iostream>
+#include<assert.h>
+#include <chrono>
+
+using namespace std;
+
+typedef struct _matrix{
+    int m,n;
+    double** data;
+} matrix_t;
+
+matrix_t matrix_create(int m, int n){
+    matrix_t mat;
+    mat.m = m;
+    mat.n = n;
+    mat.data = (double**)calloc(m, sizeof(double*));
+    mat.data[0] = (double*)calloc(m*n, sizeof(double));
+    for(int i=0; i<mat.m; i++){
+        mat.data[i] = &mat.data[0][i*n];
+    }
+    return mat;
+}
+
+void matrix_destroy(matrix_t mat){
+    free(mat.data[0]);
+    free(mat.data);
+}
+
+void matrix_swap(matrix_t *m1, matrix_t *m2){
+    int temp_m = m1->m; 
+    int temp_n = m1->n;
+    double** temp_data = m1->data;
+    
+    m1->m = m2->m;
+    m1->n = m2->n;
+    m1->data = m2->data;
+    
+    m2->m = temp_m;
+    m2->n = temp_n;
+    m2->data = temp_data;
+}
+
+void set_boundary_conditions(matrix_t U){
+    for(int i=0; i<U.m; i++){
+        U.data[i][0] = 1.0;
+        U.data[i][U.n-1] = 1.0;
+    }
+    for(int j=0; j<U.n; j++){
+        U.data[0][j] = 1.0;
+        U.data[U.m-1][j] = 1.0;
+    }
+}
+
+void cpu_step(matrix_t U, matrix_t Unext){
+    for(int i=0; i<U.m; i++){
+        for(int j=0; j<U.n; j++){
+            if(i==0 || j==0 || i==U.m-1 || j==U.n-1) Unext.data[i][j] = U.data[i][j];
+            else Unext.data[i][j] = (U.data[i+1][j]+U.data[i-1][j]+U.data[i][j+1]+U.data[i][j-1])/4.0;
+        }
+    }
+}
+
+void matrix_print(matrix_t U){
+    for(int i=0; i<U.m; i++){
+        for(int j=0; j<U.n; j++){
+            std::cout << U.data[i][j] << "  ";
+        }
+        std::cout << std::endl;
+    }
+}
+
+
+int double_compare(double* a1, double* a2, int s, double e){
+    int dif = 0; 
+    for(int i=0; i<s; i++){
+        if(fabs(a1[i]-a2[i])>e) dif++;
+    }
+    return dif;
+}
+
+__global__ void gpu_step(double *U, double *Unext, int m, int n){
+    int i = (blockIdx.x*blockDim.x) + threadIdx.x;
+    int j = (blockIdx.y*blockDim.y) + threadIdx.y;
+    if(i>0 && i<m-1 && j>0 && j<n-1){
+        Unext[i*n+j] = (U[(i+1)*n+j]+U[(i-1)*n+j]+U[i*n+j+1]+U[i*n+j-1])/4.0;
+    }
+    if(i==0 || i==m-1 || j==0 || j==n-1){
+        Unext[i*n+j] = U[i*n+j];
+    }
+}
+
+int main(){
+    int m = 1000;
+    int n = 1000;
+    int Ttot = 10000;
+
+    cout << "m : " << m << ", n : " << n << ", Ttot : " << Ttot << endl;
+
+    // alloc and init on CPU
+    matrix_t U = matrix_create(m, n);
+    matrix_t Unext = matrix_create(m, n);
+    set_boundary_conditions(U);
+
+    // alloc and init on GPU
+    double *d_U, *d_Unext;
+    cudaMalloc(&d_U, m*n*sizeof(double));
+    cudaMalloc(&d_Unext, m*n*sizeof(double));
+    cudaMemcpy(d_U, U.data[0], m*n*sizeof(double), cudaMemcpyHostToDevice);
+
+    auto start_cpu = chrono::high_resolution_clock::now();
+    // perform several iterations on CPU
+    cout << "start CPU work" << endl;
+    for(int t=0; t<Ttot; t++){
+         cpu_step(U, Unext);
+         matrix_swap(&U, &Unext);
+    }
+    auto end_cpu = chrono::high_resolution_clock::now();
+
+    
+    auto start_gpu = chrono::high_resolution_clock::now();
+
+    // GPU exec config
+    dim3 dimBlock(10, 10, 1);
+    dim3 dimGrid((m + dimBlock.x - 1) / dimBlock.x, (n + dimBlock.y - 1) / dimBlock.y, 1);  
+
+    // perform several iterations on GPU
+    cout << "start GPU work" << endl;
+    for(int t=0; t<Ttot; t++){
+        gpu_step<<<dimGrid, dimBlock>>>(d_U, d_Unext, m, n); 
+        swap(d_U, d_Unext);
+    }
+
+    // use Unext memory space to gather GPU res and compare results
+    double* gpu_res = Unext.data[0];
+    cudaMemcpy(gpu_res, d_U, m*n*sizeof(double), cudaMemcpyDeviceToHost);
+    auto end_gpu = chrono::high_resolution_clock::now();
+
+    // matrix_print(Unext);
+
+    // ensure CPU and GPU res are close
+    assert(double_compare(gpu_res, U.data[0], m*n, 0.0000001) == 0);
+
+    cout << "OK" << endl;
+    auto duration_cpu = chrono::duration_cast<chrono::milliseconds>(end_cpu - start_cpu);
+    auto duration_gpu = chrono::duration_cast<chrono::milliseconds>(end_gpu - start_gpu);
+    cout << "time on CPU : " << duration_cpu.count() << "ms" << endl;
+    cout << "time on GPU : " << duration_gpu.count() << "ms" << endl;
+
+    matrix_destroy(U);
+    matrix_destroy(Unext);
+    cudaFree(d_U);
+    cudaFree(d_Unext);
+
+}
\ No newline at end of file