基于CUDA在GPU上实现膨胀、腐蚀加速
2017-09-18 21:05
387 查看
传统的OpenCV形态学运算函数不能直接在GPU上运行,现提供几种方法,使得膨胀、腐蚀能在GPU上实现加速。笔者使用的是GPU是Nvidia的,故以下代码基于CUDA。
需要注意的是,这种方法第一次使用时,耗时较多,从第二次调用开始,运行时间变得正常。
需要注意的是,传入的参数unsigned char *src及输出unsigned char *dst为cudaMalloc而来,通过
来获得数据。
另外,当运算的元素为n×n时,传给radio的值为(n-1)/2,例如:以5×5的元素进行腐蚀、膨胀,则radio的值为2。而且不能定制元素的形状,固定为矩形元素,且长宽相等。
不同的OpenCV版本,包含的头文件路径有所差别,我用的版本是3.1.0。其他版本,若编译不通过,需要对头文件进行微小的改动。
注:以上代码参考了文章
Luke Domanski, Pascal Vallotton, Dadong Wang. Parallel van Herk/Gil-Werman image morphology on GPUs using CUDA. CSIRO Mathematical & Information Sciences, Biotech Imaging
及开源代码 https://github.com/mompes/CUDA-dilation-and-erosion-filters
1. 传统的膨胀、腐蚀
下面的例子是,对一个尺寸为5×5的矩形元素分别进行腐蚀、膨胀,元素的支点为其中心,坐标为(2, 2)。它相当于对3×3元素进行两次操作。1.1 OpenCV 1.0
IplConvKernel *elem = cvCreateStructuringElementEx(5, 5, 2, 2, CV_SHAPE_RECT); cvErode(src, dst, elem, 1); // 相当于cvErode(src, dst, NULL, 2); cvDilate(src, dst, elem, 1); // 相当于cvDilate(src, dst, NULL, 2); cvReleaseStructuringElement(&elem);
1.2 OpenCV 2.0
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5), cv::Point(2, 2)); cv::dilate(src, dst, kernel, cv::Point(2, 2), 1); cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);
2. 使用CUDA提供的函数实现腐蚀、膨胀
// 假设src、dst的类型为IplImage* using namespace cv; Mat mat_src = cvarrToMat(src); Mat mat_dst(mat_src); cuda::GpuMat gpu_src(mat_src); cuda::GpuMat gpu_dst(mat_dst); Mat elem = getStructuringElement( MORPH_RECT, Size(5, 5), Point(2, 2) ); Ptr<cuda::Filter> erodeFilter = cuda::createMorphologyFilter( MORPH_ERODE, CV_8U, elem ); erodeFilter->apply( gpu_src, gpu_dst ); Ptr<cuda::Filter> dilateFilter = cuda::createMorphologyFilter( MORPH_DILATE, CV_8U, elem ); dilateFilter->apply( gpu_src, gpu_dst ); gpu_dst.download( mat_dst ); *dst = IplImage( mat_dst );
需要注意的是,这种方法第一次使用时,耗时较多,从第二次调用开始,运行时间变得正常。
3. 自己写的函数实现
上面的方法运行时间还是挺长的,为了进一步提高运行速度,可使用以下方法3.1 函数接口:morphology.cuh
#ifndef _MORPHOLOGY_CUH_ #define _MORPHOLOGY_CUH_ #include <opencv/cv.h> #include <opencv2/highgui/highgui.hpp> #include <opencv2/features2d/features2d.hpp> #include <opencv2/core/cuda.hpp> #include <opencv2/core/mat.hpp> #include <opencv2/cudafilters.hpp> using namespace cv; void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height); void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height); #endif
需要注意的是,传入的参数unsigned char *src及输出unsigned char *dst为cudaMalloc而来,通过
cudaMemcpy(src, img->imageData, img->width * img->height, cudaMemcpyHostToDevice);
来获得数据。
另外,当运算的元素为n×n时,传给radio的值为(n-1)/2,例如:以5×5的元素进行腐蚀、膨胀,则radio的值为2。而且不能定制元素的形状,固定为矩形元素,且长宽相等。
不同的OpenCV版本,包含的头文件路径有所差别,我用的版本是3.1.0。其他版本,若编译不通过,需要对头文件进行微小的改动。
3.2 源文件:morphology.cu
#include "morphology.cuh" __global__ void ErodeSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h) { extern __shared__ int smem[]; int tx = threadIdx.x; int ty = threadIdx.y; int bx = blockIdx.x; int by = blockIdx.y; int x = bx * tile_w + tx - radio; int y = by * tile_h + ty; smem[ty * blockDim.x + tx] = 255; __syncthreads(); if( x < 0 || x >= width || y >= height ) { return; } smem[ty * blockDim.x + tx] = (int)src[y * width + x]; __syncthreads(); if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) { return; } int *smem_thread = &smem[ty * blockDim.x + tx - radio]; int val = smem_thread[0]; for( int i = 1; i <= 2 * radio; i++ ) { val = MIN( val, smem_thread[i] ); } dst[y * width + x] = (unsigned char)val; } __global__ void ErodeSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h) { extern __shared__ int smem[]; int tx = threadIdx.x; int ty = threadIdx.y; int bx = blockIdx.x; int by = blockIdx.y; int x = bx * tile_w + tx; int y = by * tile_h + ty - radio; smem[ty * blockDim.x + tx] = 255; __syncthreads(); if( x >= width || y < 0 || y >= height ) { return; } smem[ty * blockDim.x + tx] = (int)src[y * width + x]; __syncthreads(); if( y < (by * tile_h) || y >= (by + 1) * tile_h ) { return; } int *smem_thread = &smem[(ty - radio) * blockDim.x + tx]; int val = smem_thread[0]; for( int i = 1; i <= 2 * radio; i++ ) { val = MIN( val, smem_thread[i * blockDim.x] ); } dst[y * width + x] = (unsigned char)val; } __global__ void DilateSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h) { extern __shared__ int smem[]; int tx = threadIdx.x; int ty = threadIdx.y; int bx = blockIdx.x; int by = blockIdx.y; int x = bx * tile_w + tx - radio; int y = by * tile_h + ty; smem[ty * blockDim.x + tx] = 0; __syncthreads(); if( x < 0 || x >= width || y >= height ) { return; } smem[ty * blockDim.x + tx] = (int)src[y * width + x]; __syncthreads(); if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) { return; } int *smem_thread = &smem[ty * blockDim.x + tx - radio]; int val = smem_thread[0]; for( int i = 1; i <= 2 * radio; i++ ) { val = MAX( val, smem_thread[i] ); } dst[y * width + x] = (unsigned char)val; } __global__ void DilateSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h) { extern __shared__ int smem[]; int tx = threadIdx.x; int ty = threadIdx.y; int bx = blockIdx.x; int by = blockIdx.y; int x = bx * tile_w + tx; int y = by * tile_h + ty - radio; smem[ty * blockDim.x + tx] = 0; __syncthreads(); if( x >= width || y < 0 || y >= height ) { return; } smem[ty * blockDim.x + tx] = (int)src[y * width + x]; __syncthreads(); if( y < (by * tile_h) || y >= (by + 1) * tile_h ) { return; } int *smem_thread = &smem[(ty - radio) * blockDim.x + tx]; int val = smem_thread[0]; for( int i = 1; i <= 2 * radio; i++ ) { val = MAX( val, smem_thread[i * blockDim.x] ); } dst[y * width + x] = (unsigned char)val; } void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height) { unsigned char *temp = NULL; cudaMalloc( &temp, width * height * sizeof(char) ); int tile_w = 640; int tile_h = 1; dim3 block1( tile_w + 2 * radio, tile_h ); dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) ); ErodeSh a443 aredStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h); cudaDeviceSynchronize(); tile_w = 8; tile_h = 64; dim3 block2( tile_w, tile_h + 2 * radio ); dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) ); ErodeSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h); cudaDeviceSynchronize(); cudaFree( temp ); } void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height) { unsigned char *temp = NULL; cudaMalloc( &temp, width * height * sizeof(char) ); int tile_w = 640; int tile_h = 1; dim3 block1( tile_w + 2 * radio, tile_h ); dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) ); DilateSharedStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h); cudaDeviceSynchronize(); tile_w = 8; tile_h = 64; dim3 block2( tile_w, tile_h + 2 * radio ); dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) ); DilateSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h); cudaDeviceSynchronize(); cudaFree( temp ); }
注:以上代码参考了文章
Luke Domanski, Pascal Vallotton, Dadong Wang. Parallel van Herk/Gil-Werman image morphology on GPUs using CUDA. CSIRO Mathematical & Information Sciences, Biotech Imaging
及开源代码 https://github.com/mompes/CUDA-dilation-and-erosion-filters
相关文章推荐
- 基于MATLAB的腐蚀膨胀算法实现
- 基于FPGA的腐蚀膨胀算法实现
- Python基于pyCUDA实现GPU加速并行计算功能入门教程
- 基于gpu与cuda c的神经网络实现
- SSE图像算法优化系列七:基于SSE实现的极速的矩形核腐蚀和膨胀(最大值和最小值)算法。
- Win7配置CUDA并搭建基于Theano框架的GPU加速环境
- 第一个cuda程序-基于VS2010+CUDA5.0 两个向量相加的GPU实现
- OpenCV中配置CUDA,实现GPU加速
- 基于GPU平台利用CUDA加速图像处理算法 实时处理高清图像
- 基于SSE实现的极速的矩形核腐蚀和膨胀(最大值和最小值)算法。
- Win7配置CUDA并搭建基于Theano框架的GPU加速环境
- Win7配置CUDA并搭建基于Theano框架的GPU加速环境
- 要点初见:OpenCV3中CUDA ORB特征提取算法的实现(GPU加速的ORB算法)
- 在Ubuntu 14.04平台上利用Intel的GPU实现硬件加速--基于VAAPI
- Win10 64bit 安装Theano 配置CUDA 实现GPU加速 详细步骤
- Win7配置CUDA并搭建基于Theano框架的GPU加速环境
- CUDA硬件实现分析(一)------安营扎寨-----GPU的革命 推荐
- 基于CUDA的GPU优化建议
- [菜鸟每天来段CUDA_C]基于GPU的Julia集
- 视频去交错:基于模糊控制的运动自适应算法(GPU实现)