您的位置:首页 > 其它

基于CUDA在GPU上实现膨胀、腐蚀加速

2017-09-18 21:05 387 查看
        传统的OpenCV形态学运算函数不能直接在GPU上运行,现提供几种方法,使得膨胀、腐蚀能在GPU上实现加速。笔者使用的是GPU是Nvidia的,故以下代码基于CUDA

1. 传统的膨胀、腐蚀

        下面的例子是,对一个尺寸为5×5的矩形元素分别进行腐蚀、膨胀,元素的支点为其中心,坐标为(2, 2)。它相当于对3×3元素进行两次操作

1.1 OpenCV 1.0

    

IplConvKernel *elem = cvCreateStructuringElementEx(5, 5, 2, 2, CV_SHAPE_RECT);
cvErode(src, dst, elem, 1);	// 相当于cvErode(src, dst, NULL, 2);
cvDilate(src, dst, elem, 1);	// 相当于cvDilate(src, dst, NULL, 2);
cvReleaseStructuringElement(&elem);


1.2 OpenCV 2.0

cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5), cv::Point(2, 2));
cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);
cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);


2. 使用CUDA提供的函数实现腐蚀、膨胀

// 假设src、dst的类型为IplImage*

using namespace cv;

Mat mat_src = cvarrToMat(src);
Mat mat_dst(mat_src);

cuda::GpuMat gpu_src(mat_src);
cuda::GpuMat gpu_dst(mat_dst);

Mat elem = getStructuringElement( MORPH_RECT, Size(5, 5), Point(2, 2) );
Ptr<cuda::Filter> erodeFilter = cuda::createMorphologyFilter( MORPH_ERODE, CV_8U, elem );
erodeFilter->apply( gpu_src, gpu_dst );

Ptr<cuda::Filter> dilateFilter = cuda::createMorphologyFilter( MORPH_DILATE, CV_8U, elem );
dilateFilter->apply( gpu_src, gpu_dst );

gpu_dst.download( mat_dst );
*dst = IplImage( mat_dst );


需要注意的是,这种方法第一次使用时,耗时较多,从第二次调用开始,运行时间变得正常。

3. 自己写的函数实现

上面的方法运行时间还是挺长的,为了进一步提高运行速度,可使用以下方法

3.1  函数接口:morphology.cuh

#ifndef _MORPHOLOGY_CUH_
#define _MORPHOLOGY_CUH_
#include <opencv/cv.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/cudafilters.hpp>
using namespace cv;

void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height);

void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height);

#endif


需要注意的是,传入的参数unsigned char *src及输出unsigned char *dst为cudaMalloc而来,通过

cudaMemcpy(src, img->imageData, img->width * img->height, cudaMemcpyHostToDevice);


来获得数据。

        另外,当运算的元素为n×n时,传给radio的值为(n-1)/2,例如:以5×5的元素进行腐蚀、膨胀,则radio的值为2。而且不能定制元素的形状,固定为矩形元素,且长宽相等。

        不同的OpenCV版本,包含的头文件路径有所差别,我用的版本是3.1.0。其他版本,若编译不通过,需要对头文件进行微小的改动。

3.2 源文件:morphology.cu

#include "morphology.cuh"

__global__ void ErodeSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx - radio;
int y = by * tile_h + ty;

smem[ty * blockDim.x + tx] = 255;
__syncthreads();
if( x < 0 || x >= width || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) {
return;
}

int *smem_thread = &smem[ty * blockDim.x + tx - radio];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MIN( val, smem_thread[i] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void ErodeSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx;
int y = by * tile_h + ty - radio;

smem[ty * blockDim.x + tx] = 255;
__syncthreads();
if( x >= width || y < 0 || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( y < (by * tile_h) || y >= (by + 1) * tile_h ) {
return;
}

int *smem_thread = &smem[(ty - radio) * blockDim.x + tx];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MIN( val, smem_thread[i * blockDim.x] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void DilateSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx - radio;
int y = by * tile_h + ty;

smem[ty * blockDim.x + tx] = 0;
__syncthreads();
if( x < 0 || x >= width || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) {
return;
}

int *smem_thread = &smem[ty * blockDim.x + tx - radio];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MAX( val, smem_thread[i] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void DilateSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx;
int y = by * tile_h + ty - radio;

smem[ty * blockDim.x + tx] = 0;
__syncthreads();
if( x >= width || y < 0 || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( y < (by * tile_h) || y >= (by + 1) * tile_h ) {
return;
}

int *smem_thread = &smem[(ty - radio) * blockDim.x + tx];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MAX( val, smem_thread[i * blockDim.x] );
}
dst[y * width + x] = (unsigned char)val;
}

void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height)
{
unsigned char *temp = NULL;
cudaMalloc( &temp, width * height * sizeof(char) );

int tile_w = 640;
int tile_h = 1;
dim3 block1( tile_w + 2 * radio, tile_h );
dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) );
ErodeSh
a443
aredStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

tile_w = 8;
tile_h = 64;
dim3 block2( tile_w, tile_h + 2 * radio );
dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) );
ErodeSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

cudaFree( temp );
}

void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height)
{
unsigned char *temp = NULL;
cudaMalloc( &temp, width * height * sizeof(char) );

int tile_w = 640;
int tile_h = 1;
dim3 block1( tile_w + 2 * radio, tile_h );
dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) );
DilateSharedStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

tile_w = 8;
tile_h = 64;
dim3 block2( tile_w, tile_h + 2 * radio );
dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) );
DilateSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

cudaFree( temp );
}


注:以上代码参考了文章
Luke Domanski, Pascal Vallotton, Dadong Wang. Parallel van Herk/Gil-Werman image morphology on GPUs using CUDA. CSIRO Mathematical & Information Sciences, Biotech Imaging
及开源代码 https://github.com/mompes/CUDA-dilation-and-erosion-filters
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: