您的位置：首页 > 其它

基于CUDA在GPU上实现膨胀、腐蚀加速

2017-09-18 21:05 387 查看

传统的OpenCV形态学运算函数不能直接在GPU上运行，现提供几种方法，使得膨胀、腐蚀能在GPU上实现加速。笔者使用的是GPU是Nvidia的，故以下代码基于CUDA。

1. 传统的膨胀、腐蚀

下面的例子是，对一个尺寸为5×5的矩形元素分别进行腐蚀、膨胀，元素的支点为其中心，坐标为(2, 2)。它相当于对3×3元素进行两次操作。

1.1 OpenCV 1.0

IplConvKernel *elem = cvCreateStructuringElementEx(5, 5, 2, 2, CV_SHAPE_RECT);
cvErode(src, dst, elem, 1);	// 相当于cvErode(src, dst, NULL, 2);
cvDilate(src, dst, elem, 1);	// 相当于cvDilate(src, dst, NULL, 2);
cvReleaseStructuringElement(&elem);

1.2 OpenCV 2.0

cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5), cv::Point(2, 2));
cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);
cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);

2. 使用CUDA提供的函数实现腐蚀、膨胀

// 假设src、dst的类型为IplImage*

using namespace cv;

Mat mat_src = cvarrToMat(src);
Mat mat_dst(mat_src);

cuda::GpuMat gpu_src(mat_src);
cuda::GpuMat gpu_dst(mat_dst);

Mat elem = getStructuringElement( MORPH_RECT, Size(5, 5), Point(2, 2) );
Ptr<cuda::Filter> erodeFilter = cuda::createMorphologyFilter( MORPH_ERODE, CV_8U, elem );
erodeFilter->apply( gpu_src, gpu_dst );

Ptr<cuda::Filter> dilateFilter = cuda::createMorphologyFilter( MORPH_DILATE, CV_8U, elem );
dilateFilter->apply( gpu_src, gpu_dst );

gpu_dst.download( mat_dst );
*dst = IplImage( mat_dst );

需要注意的是，这种方法第一次使用时，耗时较多，从第二次调用开始，运行时间变得正常。

3. 自己写的函数实现

上面的方法运行时间还是挺长的，为了进一步提高运行速度，可使用以下方法

3.1 函数接口：morphology.cuh

#ifndef _MORPHOLOGY_CUH_
#define _MORPHOLOGY_CUH_
#include <opencv/cv.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/cudafilters.hpp>
using namespace cv;

void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height);

void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height);

#endif

需要注意的是，传入的参数unsigned char *src及输出unsigned char *dst为cudaMalloc而来，通过

cudaMemcpy(src, img->imageData, img->width * img->height, cudaMemcpyHostToDevice);

来获得数据。

另外，当运算的元素为n×n时，传给radio的值为(n-1)/2，例如：以5×5的元素进行腐蚀、膨胀，则radio的值为2。而且不能定制元素的形状，固定为矩形元素，且长宽相等。

不同的OpenCV版本，包含的头文件路径有所差别，我用的版本是3.1.0。其他版本，若编译不通过，需要对头文件进行微小的改动。

3.2 源文件：morphology.cu

#include "morphology.cuh"

__global__ void ErodeSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx - radio;
int y = by * tile_h + ty;

smem[ty * blockDim.x + tx] = 255;
__syncthreads();
if( x < 0 || x >= width || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) {
return;
}

int *smem_thread = &smem[ty * blockDim.x + tx - radio];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MIN( val, smem_thread[i] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void ErodeSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx;
int y = by * tile_h + ty - radio;

smem[ty * blockDim.x + tx] = 255;
__syncthreads();
if( x >= width || y < 0 || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( y < (by * tile_h) || y >= (by + 1) * tile_h ) {
return;
}

int *smem_thread = &smem[(ty - radio) * blockDim.x + tx];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MIN( val, smem_thread[i * blockDim.x] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void DilateSharedStep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx - radio;
int y = by * tile_h + ty;

smem[ty * blockDim.x + tx] = 0;
__syncthreads();
if( x < 0 || x >= width || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) {
return;
}

int *smem_thread = &smem[ty * blockDim.x + tx - radio];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MAX( val, smem_thread[i] );
}
dst[y * width + x] = (unsigned char)val;
}

__global__ void DilateSharedStep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
{
extern __shared__ int smem[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int x = bx * tile_w + tx;
int y = by * tile_h + ty - radio;

smem[ty * blockDim.x + tx] = 0;
__syncthreads();
if( x >= width || y < 0 || y >= height ) {
return;
}
smem[ty * blockDim.x + tx] = (int)src[y * width + x];
__syncthreads();

if( y < (by * tile_h) || y >= (by + 1) * tile_h ) {
return;
}

int *smem_thread = &smem[(ty - radio) * blockDim.x + tx];
int val = smem_thread[0];
for( int i = 1; i <= 2 * radio; i++ ) {
val = MAX( val, smem_thread[i * blockDim.x] );
}
dst[y * width + x] = (unsigned char)val;
}

void ErodeTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height)
{
unsigned char *temp = NULL;
cudaMalloc( &temp, width * height * sizeof(char) );

int tile_w = 640;
int tile_h = 1;
dim3 block1( tile_w + 2 * radio, tile_h );
dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) );
ErodeSh
a443
aredStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

tile_w = 8;
tile_h = 64;
dim3 block2( tile_w, tile_h + 2 * radio );
dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) );
ErodeSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

cudaFree( temp );
}

void DilateTwoStepShared(unsigned char *src, unsigned char *dst, int radio, int width, int height)
{
unsigned char *temp = NULL;
cudaMalloc( &temp, width * height * sizeof(char) );

int tile_w = 640;
int tile_h = 1;
dim3 block1( tile_w + 2 * radio, tile_h );
dim3 grid1( ceil((float)width / tile_w), ceil((float)height / tile_h) );
DilateSharedStep1<<<grid1, block1, block1.y * block1.x * sizeof(int)>>>(src, temp, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

tile_w = 8;
tile_h = 64;
dim3 block2( tile_w, tile_h + 2 * radio );
dim3 grid2( ceil((float)width / tile_w), ceil((float)height / tile_h) );
DilateSharedStep2<<<grid2, block2, block2.y * block2.x * sizeof(int)>>>(temp, dst, radio, width, height, tile_w, tile_h);
cudaDeviceSynchronize();

cudaFree( temp );
}

注：以上代码参考了文章
Luke Domanski, Pascal Vallotton, Dadong Wang. Parallel van Herk/Gil-Werman image morphology on GPUs using CUDA. CSIRO Mathematical & Information Sciences, Biotech Imaging
及开源代码 https://github.com/mompes/CUDA-dilation-and-erosion-filters

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航