diff --git a/exercices/03-equation-chaleur/correction/Makefile b/exercices/03-equation-chaleur/correction/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..59210a56b289a03fcb55d2dc916fcc82fbbe7f0e --- /dev/null +++ b/exercices/03-equation-chaleur/correction/Makefile @@ -0,0 +1,14 @@ +NVCC := nvcc +CFLAGS := -O3 +SRC := heat_equation.cu +EXEC := heat_equation + +all: $(EXEC) + +$(EXEC): $(SRC) + $(NVCC) $(CFLAGS) -o $(EXEC) $(SRC) + +clean: + rm -f $(EXEC) + +.PHONY: all clean \ No newline at end of file diff --git a/exercices/03-equation-chaleur/correction/heat_equation.cu b/exercices/03-equation-chaleur/correction/heat_equation.cu new file mode 100644 index 0000000000000000000000000000000000000000..6afc953c3d148d11936dc855ba565af9d648d3c5 --- /dev/null +++ b/exercices/03-equation-chaleur/correction/heat_equation.cu @@ -0,0 +1,154 @@ +#include<iostream> +#include<assert.h> +#include <chrono> + +using namespace std; + +typedef struct _matrix{ + int m,n; + double** data; +} matrix_t; + +matrix_t matrix_create(int m, int n){ + matrix_t mat; + mat.m = m; + mat.n = n; + mat.data = (double**)calloc(m, sizeof(double*)); + mat.data[0] = (double*)calloc(m*n, sizeof(double)); + for(int i=0; i<mat.m; i++){ + mat.data[i] = &mat.data[0][i*n]; + } + return mat; +} + +void matrix_destroy(matrix_t mat){ + free(mat.data[0]); + free(mat.data); +} + +void matrix_swap(matrix_t *m1, matrix_t *m2){ + int temp_m = m1->m; + int temp_n = m1->n; + double** temp_data = m1->data; + + m1->m = m2->m; + m1->n = m2->n; + m1->data = m2->data; + + m2->m = temp_m; + m2->n = temp_n; + m2->data = temp_data; +} + +void set_boundary_conditions(matrix_t U){ + for(int i=0; i<U.m; i++){ + U.data[i][0] = 1.0; + U.data[i][U.n-1] = 1.0; + } + for(int j=0; j<U.n; j++){ + U.data[0][j] = 1.0; + U.data[U.m-1][j] = 1.0; + } +} + +void cpu_step(matrix_t U, matrix_t Unext){ + for(int i=0; i<U.m; i++){ + for(int j=0; j<U.n; j++){ + if(i==0 || j==0 || i==U.m-1 || j==U.n-1) Unext.data[i][j] = U.data[i][j]; + else Unext.data[i][j] = (U.data[i+1][j]+U.data[i-1][j]+U.data[i][j+1]+U.data[i][j-1])/4.0; + } + } +} + +void matrix_print(matrix_t U){ + for(int i=0; i<U.m; i++){ + for(int j=0; j<U.n; j++){ + std::cout << U.data[i][j] << " "; + } + std::cout << std::endl; + } +} + + +int double_compare(double* a1, double* a2, int s, double e){ + int dif = 0; + for(int i=0; i<s; i++){ + if(fabs(a1[i]-a2[i])>e) dif++; + } + return dif; +} + +__global__ void gpu_step(double *U, double *Unext, int m, int n){ + int i = (blockIdx.x*blockDim.x) + threadIdx.x; + int j = (blockIdx.y*blockDim.y) + threadIdx.y; + if(i>0 && i<m-1 && j>0 && j<n-1){ + Unext[i*n+j] = (U[(i+1)*n+j]+U[(i-1)*n+j]+U[i*n+j+1]+U[i*n+j-1])/4.0; + } + if(i==0 || i==m-1 || j==0 || j==n-1){ + Unext[i*n+j] = U[i*n+j]; + } +} + +int main(){ + int m = 1000; + int n = 1000; + int Ttot = 10000; + + cout << "m : " << m << ", n : " << n << ", Ttot : " << Ttot << endl; + + // alloc and init on CPU + matrix_t U = matrix_create(m, n); + matrix_t Unext = matrix_create(m, n); + set_boundary_conditions(U); + + // alloc and init on GPU + double *d_U, *d_Unext; + cudaMalloc(&d_U, m*n*sizeof(double)); + cudaMalloc(&d_Unext, m*n*sizeof(double)); + cudaMemcpy(d_U, U.data[0], m*n*sizeof(double), cudaMemcpyHostToDevice); + + auto start_cpu = chrono::high_resolution_clock::now(); + // perform several iterations on CPU + cout << "start CPU work" << endl; + for(int t=0; t<Ttot; t++){ + cpu_step(U, Unext); + matrix_swap(&U, &Unext); + } + auto end_cpu = chrono::high_resolution_clock::now(); + + + auto start_gpu = chrono::high_resolution_clock::now(); + + // GPU exec config + dim3 dimBlock(10, 10, 1); + dim3 dimGrid((m + dimBlock.x - 1) / dimBlock.x, (n + dimBlock.y - 1) / dimBlock.y, 1); + + // perform several iterations on GPU + cout << "start GPU work" << endl; + for(int t=0; t<Ttot; t++){ + gpu_step<<<dimGrid, dimBlock>>>(d_U, d_Unext, m, n); + swap(d_U, d_Unext); + } + + // use Unext memory space to gather GPU res and compare results + double* gpu_res = Unext.data[0]; + cudaMemcpy(gpu_res, d_U, m*n*sizeof(double), cudaMemcpyDeviceToHost); + auto end_gpu = chrono::high_resolution_clock::now(); + + // matrix_print(Unext); + + // ensure CPU and GPU res are close + assert(double_compare(gpu_res, U.data[0], m*n, 0.0000001) == 0); + + cout << "OK" << endl; + auto duration_cpu = chrono::duration_cast<chrono::milliseconds>(end_cpu - start_cpu); + auto duration_gpu = chrono::duration_cast<chrono::milliseconds>(end_gpu - start_gpu); + cout << "time on CPU : " << duration_cpu.count() << "ms" << endl; + cout << "time on GPU : " << duration_gpu.count() << "ms" << endl; + + matrix_destroy(U); + matrix_destroy(Unext); + cudaFree(d_U); + cudaFree(d_Unext); + +} \ No newline at end of file