您的位置:首页 > 其它

矩阵分解

2017-03-30 15:28 260 查看
首选说明一下,dot点积运算,定义n维向量的点积运算为



其实,这个点积运算就是1个1*n的行矩阵和1个n*1的列矩阵的矩阵运算。因此,完全可以将点积运算转化为矩阵乘法,就好像深度学习框架中,将卷积运算转化为矩阵乘法实现一样。
       而这里需要说明的就是,如果只是单纯的计算1个点积运算,这样转化为矩阵和不做这样的转化,没有任何意义。而当我们需要做大量的这样的点积运算的时候,这样的意义就尤为凸显。
       再说一下,norm求模运算的优化,假设有m个a向量,n个b向量,如果这时要m个a和n个b两两算cosine距离,每一次计算都要除以a的模和b的模,一共要算m*n次,如果,给a,b向量分别归一化,再算cosine,这样只需要m+n次,为什么呢,因为a和b的模都已经变为1。
 
有了上面的知识,进入下面的正题。
目前流行的矩阵加速框架有,多线程openMP,eigen(mkl),armadillo(openblas),cublas(cublasSgemm)
自己测试的加速效果如下,注意这里的Cublas的时间算上了数据从cpu到gpu的时间和结果从gpu到cpu的时间,如果不计算考进考出的时间,gpu无疑是最快的加速方案。



基于上面的结果,自己撸了一个通用的加速接口,并且在内部实现了矩阵的分解,可以进行超大矩阵的运算。一句话,只要内存放的下,都可以直接传入函数接口进行矩阵运算。
int matrix_mul(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
if (mode == "GPU" || mode == "gpu")
{
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
cout << "CUBLAS 对象实例化出错" << endl;
}
return EXIT_FAILURE;
}

if (A_ROW > maxRow)
{
float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < B_COL; itt++)
BB[B_ROW*itt + it] = B[B_COL*it + itt];
}

int ranknum = ceil((float)A_ROW / (float)maxRow);
float *dev_B;
cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));

for (int i = 0; i < ranknum; i++)
{

float *dev_A, *dev_C;
int lastROW;
if (i < ranknum - 1)
{
lastROW = maxRow;
}
else
{
lastROW = (A_ROW - (maxRow*i));
}

cudaMalloc((void**)&dev_A, lastROW*A_COL * sizeof(float));
cudaMalloc((void**)&dev_C, lastROW*B_ROW * sizeof(float));
CUDA_SAFE_CALL(cudaMemcpy(dev_A, A + maxRow*i*A_COL, lastROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemset(dev_C, 0, lastROW*B_ROW * sizeof(float)));

cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, lastROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);
cudaMemcpy(CC + i*maxRow*B_ROW, dev_C, lastROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(dev_A);
cudaFree(dev_C);

}
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
C[A_ROW*it + itt] = CC[B_ROW*itt + it];
}
free(CC);
free(BB);
cudaFree(dev_B);

}
else
{

float *dev_A, *dev_B, *dev_C;
cudaMalloc((void**)&dev_A, A_ROW*A_COL * sizeof(float));
cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
cudaMalloc((void**)&dev_C, A_ROW*B_ROW * sizeof(float));

float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < B_COL; itt++)
{
BB[B_ROW*itt + it] = B[B_COL*it + itt];
}

}
CUDA_SAFE_CALL(cudaMemcpy(dev_A, A, A_ROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemset(dev_C, 0, A_ROW*B_ROW * sizeof(float)));

cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, A_ROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);

cudaMemcpy(CC, dev_C, A_ROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);

for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
C[A_ROW*it + itt] = CC[B_ROW*itt + it];
}
free(BB);
free(CC);
cudaFree(dev_A);
cudaFree(dev_B);
cudaFree(dev_C);

}

cublasDestroy(handle);

}
else if (mode == "CPU" || mode == "cpu")
{

if (A_ROW > maxRow)
{
if (B_ROW < OMPEIGEN_CHOICE)
{
//openMP
for (int i = 0; i < B_ROW; i++)
{
#pragma omp parallel for
for (int j = 0; j < A_ROW; j++)
{
double result = 0.0;
for (int k = 0; k < A_COL; k++)
result += A[j*A_COL + k] * B[i*A_COL + k];
C[A_ROW*i + j] = result;
}

}

}else
{
int ranknumA = ceil((float)A_ROW / (float)maxRow);
float *CC_tmp = (float*)malloc(sizeof(float)*B_ROW*A_ROW);
for (int jj = 0; jj < ranknumA; jj++)
{
int lastROWA;
if (jj < ranknumA - 1)
{
lastROWA = maxRow;
}
else
{
lastROWA = (A_ROW - (maxRow*jj));
}

MatrixXf featuremat(lastROWA, A_COL);
for (int row = 0; row < lastROWA; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col + jj*maxRow*A_COL];
}
MatrixXf vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
MatrixXf scoremat = vectormat*featuremat.transpose();
for (int col = 0; col < lastROWA; col++)
for (int row = 0; row < B_ROW; row++)
{
CC_tmp[col*B_ROW + row + jj*maxRow*B_ROW] = scoremat(row, col);
}

}
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
{
C[A_ROW*it + itt] = CC_tmp[B_ROW*itt + it];
}
}
free(CC_tmp);
}

}
else
{
if (B_ROW < OMPEIGEN_CHOICE)
{
//openMP
for (int i = 0; i < B_ROW; i++)
{
#pragma omp parallel for
for (int j = 0; j < A_ROW; j++)
{
double result = 0.0;
for (int k = 0; k < A_COL; k++)
result += A[j*A_COL + k] * B[i*A_COL + k];
C[A_ROW*i + j] = result;
}

}

}
else
{
//eigen
MatrixXf featuremat(A_ROW, A_COL);
for (int row = 0; row < A_ROW; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col];
}
MatrixXf vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
MatrixXf scoremat = featuremat*vectormat.transpose();
for (int col = 0; col < B_ROW; col++)
for (int row = 0; row < A_ROW; row++)
{
C[col*A_ROW + row] = scoremat(row, col);
}

}

}

//armadillo
/*arma::mat featuremat( A_ROW, A_COL);
for (int row = 0; row < A_ROW; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col];
}
arma::mat vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
arma::mat scoremat = featuremat*vectormat.t();
for (int col = 0; col < B_ROW; col++)
for (int row = 0; row < A_ROW; row++)
{
C[col*A_ROW + row] = scoremat(row, col);
}*/

}
else
{
cout << "wrong mode,mode must be one of GPU,gpu,CPU,cpu" << endl;
}

return 1;
}


int matrix_mul_NN(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
if (A_COL != B_COL)
{
cout << "matrix A's col is not equal to matrix B's row" << endl;
return 0;
}
__TIC__();

if (B_ROW > maxRow)
{
int ranknumB = ceil((float)B_ROW / (float)maxRow);
for (int j = 0; j < ranknumB; j++)
{
int lastROWB;
if (j < ranknumB - 1)
{
lastROWB = maxRow;
}
else
{
lastROWB = (B_ROW - (maxRow*j));
}
float *B_tmp = (float*)malloc(sizeof(float)*B_COL*lastROWB);
memcpy(B_tmp, B + j*B_COL*maxRow, sizeof(float)*B_COL*lastROWB);
float *C_tmp = (float*)malloc(sizeof(float)*A_ROW*lastROWB);
matrix_mul(A, A_ROW, A_COL, B_tmp, lastROWB, B_COL, C_tmp, mode, maxRow);
memcpy(C + j*A_ROW*maxRow, C_tmp, sizeof(float)*A_ROW*lastROWB);
free(B_tmp);
free(C_tmp);

}
}
else
{
matrix_mul(A, A_ROW, A_COL, B, B_ROW, B_COL, C, mode, maxRow);
}

__TOC__();
return 1;
}


细节不再赘述,The End!
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: