cuda编程入门示例19---矩阵相乘
2016-11-19 20:04
399 查看
// CUDA beginner example 19: dense matrix multiplication (GPU vs. CPU reference).
//
// Generates two random n x n matrices, multiplies them on the GPU with a
// naive one-thread-per-element kernel, recomputes the product on the CPU in
// double precision, and reports the numerical error plus a rough timing /
// GFLOPS figure.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// Print the CUDA error string with its source location and abort.
// Used through the HANDLE_ERROR macro so the reported file/line are the
// call site, not this helper.
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Abort with a diagnostic if a host allocation returned NULL.
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
    printf("Host memory failed in %s at line %d\n", \
           __FILE__, __LINE__); \
    exit(EXIT_FAILURE); }}

// Select the first device with compute capability >= 1.0 and make it
// current. Prints the chosen device's name. Returns false (with a message
// on stderr) when no usable device exists.
static bool InitCUDA(void)
{
    int count = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;
    cudaDeviceProp prop = {0};
    for (i = 0; i < count; i++) {
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printf("%s\n", prop.name);
                break;
            }
        }
    }
    if (i >= count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Fill the n x n matrix `a` (leading dimension lda, row-major) with random
// values. BUG FIX: the original wrote (RAND_MAX * RAND_MAX) as an int
// expression, which overflows -- undefined behavior -- on platforms where
// RAND_MAX == INT_MAX; the divisor is now computed in floating point.
static void matgen(float *a, int lda, int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            a[i * lda + j] = (float)rand() / RAND_MAX
                           + (float)(rand() / ((double)RAND_MAX * RAND_MAX));
        }
    }
}

// CPU reference: c = a * b for n x n row-major matrices with the given
// leading dimensions. Accumulates in double so the reference is more
// accurate than the single-precision GPU result it is compared against.
static void matmult(const float *a, int lda, const float *b, int ldb,
                    float *c, int ldc, int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double t = 0;
            for (int k = 0; k < n; k++) {
                t += a[i * lda + k] * b[k * ldb + j];
            }
            c[i * ldc + j] = (float)t;
        }
    }
}

// Compare matrix `a` against reference `b` (both n x n with the given
// leading dimensions) and print the maximum absolute error, maximum
// relative error, and average relative error. Entries where the reference
// is exactly zero are skipped to avoid division by zero.
static void compare_mat(const float *a, int lda,
                        const float *b, int ldb, int n)
{
    float max_err = 0;
    float average_err = 0;
    float max_absolute_err = 0;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (b[i * ldb + j] != 0) {
                float tmp = fabsf(a[i * lda + j] - b[i * ldb + j]);
                if (max_absolute_err < tmp) {
                    max_absolute_err = tmp;
                }
                float err = fabsf(tmp / b[i * ldb + j]);
                if (max_err < err) {
                    max_err = err;
                }
                average_err += err;
            }
        }
    }
    printf("Max absolute error: %g\nMax error: %g\nAverage error: %g\n",
           max_absolute_err, max_err, average_err / (n * n));
}

// Naive GPU matrix multiply: one thread per output element, launched as a
// 1D grid of 1D blocks. Thread `id` computes element (id / n, id % n) of
// dev_c = dev_a * dev_b. The bounds check guards the tail block when
// n * n is not a multiple of the block size.
__global__ static void matMultCUDA(const float *dev_a, size_t lda,
                                   const float *dev_b, size_t ldb,
                                   float *dev_c, size_t ldc, int n)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int id = bid * blockDim.x + tid;
    const int row = id / n;
    const int col = id % n;

    if (row < n && col < n) {
        float t = 0;
        for (int i = 0; i < n; i++) {
            t += dev_a[row * lda + i] * dev_b[i * ldb + col];
        }
        dev_c[row * ldc + col] = t;
    }
}

// Host wrapper: copy a and b to the device, run matMultCUDA, copy the
// result back into c. Returns the elapsed clock() ticks, which include
// the device allocations and both transfer directions (the final blocking
// cudaMemcpy2D also serves as the kernel synchronization point).
// NOTE(review): clock() measures CPU time, not wall time; the reported
// GFLOPS figure is therefore only a rough estimate.
static clock_t matmultCUDA(const float *a, int lda, const float *b, int ldb,
                           float *c, int ldc, int n)
{
    const int thread_num = 256;
    float *dev_a, *dev_b, *dev_c;
    clock_t time = clock();

    HANDLE_ERROR(cudaMalloc((void **)&dev_a, sizeof(float) * n * n));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b, sizeof(float) * n * n));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c, sizeof(float) * n * n));

    // Strided host rows (pitch lda/ldb) are packed into dense n-wide
    // device rows via cudaMemcpy2D.
    HANDLE_ERROR(cudaMemcpy2D(dev_a, sizeof(float) * n, a,
                              sizeof(float) * lda, sizeof(float) * n, n,
                              cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy2D(dev_b, sizeof(float) * n, b,
                              sizeof(float) * ldb, sizeof(float) * n, n,
                              cudaMemcpyHostToDevice));

    // Ceiling division so every one of the n*n elements gets a thread.
    int blocks = (n * n + thread_num - 1) / thread_num;
    matMultCUDA<<<blocks, thread_num>>>(dev_a, n, dev_b, n, dev_c, n, n);
    // BUG FIX: the original never checked the launch; a bad configuration
    // would have been silently ignored.
    HANDLE_ERROR(cudaGetLastError());

    HANDLE_ERROR(cudaMemcpy2D(c, sizeof(float) * ldc, dev_c,
                              sizeof(float) * n, sizeof(float) * n, n,
                              cudaMemcpyDeviceToHost));

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return clock() - time;
}

int main(int argc, char *argv[])
{
    const int n = 64 * 4 * 2;  // 512 x 512 matrices
    float *a, *b, *c, *d;

    // BUG FIX: InitCUDA was defined but never called, so the device
    // selection / capability check was dead code.
    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    HANDLE_NULL(a = (float *)malloc(sizeof(float) * n * n));
    HANDLE_NULL(b = (float *)malloc(sizeof(float) * n * n));
    HANDLE_NULL(c = (float *)malloc(sizeof(float) * n * n));
    HANDLE_NULL(d = (float *)malloc(sizeof(float) * n * n));

    srand(0);  // fixed seed for reproducible matrices
    matgen(a, n, n);
    matgen(b, n, n);

    clock_t time = matmultCUDA(a, n, b, n, c, n, n);  // GPU result -> c
    matmult(a, n, b, n, d, n, n);                     // CPU reference -> d
    compare_mat(c, n, d, n, n);

    double sec = (double)time / CLOCKS_PER_SEC;
    // 2*n^3 flops for an n x n matrix multiply (one mul + one add each).
    printf("Time used: %.6fs (%.6lf GFLOPS)\n", sec,
           2.0 * n * n * n / (sec * 1E9));

    free(a);
    free(b);
    free(c);
    free(d);

    // Remember to release the device.
    cudaDeviceReset();
    return 0;
}
相关文章推荐
- cuda编程入门示例1---两个向量对应元素相乘
- CUDA编程接口:共享存储器实现矩阵相乘
- cuda编程入门示例3---数组求和
- cuda编程入门示例20
- CUDA编程-(2)其实写个矩阵相乘并不是那么难
- cuda编程入门示例24
- 【CUDA并行编程之四】矩阵相乘
- cuda编程入门示例12
- cuda编程入门示例21
- cuda编程入门示例15
- CUDA编程之示例(GPU读取图像矩阵的像素值--未完待续)
- cuda编程入门示例10
- cuda编程入门示例6
- cuda编程入门示例22
- CUDA编程入门:向量加法和矩阵乘法
- cuda编程入门示例2---CUDA environment initialization
- cuda编程入门示例8
- cuda编程入门示例9
- cuda编程入门示例25
- cuda编程入门示例23