您的位置:首页 > 其它

向量乘以矩阵(vector_matrix_multiplication)

2009-09-19 13:47 337 查看
/********************************************************************
    created:    2009/09/19
    created:    19:9:2009   12:00
    filename:     vector_matrix_multiplication.cu
    file base:    vector_matrix_multiplication
    file ext:    CUDA
    author:        zhao.kaiyong(at)gmail.com
    purpose:    vector matrix multiplication
    copyright:    everyone can use this code, please specify source.
                任意使用,请注明出处;
http://www.hpctech.com
http://openhero.net
*********************************************************************/
#include
#include
#include
#include
/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
#if __DEVICE_EMULATION__
bool InitCUDA(void){return true;}
#else
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;
    cudaGetDeviceCount(&count);
    if(count == 0) {
        fprintf(stderr, "There is no device./n");
        return false;
    }
    for(i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }
    if(i == count) {
        fprintf(stderr, "There is no device supporting CUDA./n");
        return false;
    }
    cudaSetDevice(i);
    printf("CUDA initialized./n");
    return true;
}
#endif
/************************************************************************/
/* Example                                                              */
/************************************************************************/
__global__ static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
    __shared__ float subA[64];
    A = A + threadIdx.x;
    B = B + blockIdx.x * 64 + threadIdx.x;
    C = C + blockIdx.x * 64 + threadIdx.x;
    float subC = 0.0;
    for (int i = 0; i < wA; i+=64)
    {
        subA[threadIdx.x] = A[i];
        __syncthreads();
#pragma unroll
        for (int j = 0; j < 64; j++, B += wB)
        {
            subC += subA[j] * B[0];
        }
        __syncthreads();
    }
    C[0] = subC;
}
//__global__ static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{

//}
#define  RUN_TEST
#define  WA 32
#define  WB 64
/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
int main(int argc, char* argv[])
{
    if(!InitCUDA()) {
        return 0;
    }
    srand(2009);
    long wA = 64* WA;
    long wB = 64* WB;
    long size_A = wA;
    long size_B = wA*wB;
    long size_C = wB;
    float    *hA = (float*)malloc(sizeof(float) * size_A);
    float    *hB = (float*)malloc(sizeof(float) * size_B);
    float    *hC = (float*)malloc(sizeof(float) * size_C);
    float    *testhC = (float*)malloc(sizeof(float) * size_C);
    for (int i = 0; i < size_A; i++)
    {
        hA[i] = (float)rand()/(float)RAND_MAX;
    }
    for (int i = 0; i < size_B; i++)
    {
        hB[i] = (float)rand()/(float)RAND_MAX;
    }
    float    *dA    = 0;
    float    *dB = 0;
    float    *dC = 0;
    CUDA_SAFE_CALL( cudaMalloc((void**) &dA, sizeof(float) * size_A));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dB, sizeof(float) * size_B));
    CUDA_SAFE_CALL( cudaMalloc((void**) &dC, sizeof(float) * size_C));
    CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(float)*size_A, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(float)*size_B, cudaMemcpyHostToDevice));
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));
    dim3 threads = 64;
    dim3 blocks = wB/64;
    vector_matrix_mult_kernel<<>>(dA, wA, dB, wB, dC);
    CUT_CHECK_ERROR("Kernel execution failed/n");
    CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)/n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutResetTimer( timer));
    for (int i = 0; i < wB; i++)
    {
        float subC = 0.0;
        for (int j = 0; j         {
            subC += hA[j] * hB[j*wB + i];
        }
        testhC[i] = subC;
    }
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)/n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));
#ifdef RUN_TEST
    CUTBoolean res = cutCompareL2fe(testhC, hC, size_C, 1e-6f);
    printf("Test %s /n", (1 == res) ? "PASSED":"FAILED");
#endif
    CUDA_SAFE_CALL( cudaFree(dA));
    CUDA_SAFE_CALL( cudaFree(dB));
    CUDA_SAFE_CALL( cudaFree(dC));
    free(hA);
    free(hB);
    free(hC);
    CUT_EXIT(argc, argv);
    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐