CUDA重写opencv最近邻插值,宽和高分别放大k倍

  • Post author:
  • Post category:其他


第一种方法:

从global memory加载数据时coalesce

#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>
#include  "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>
 
// using namespace cv;
// using namespace std;

// Nearest-neighbour upscaling by an integer factor k in both x and y.
//
// Launch layout: a 2D grid with one thread per SOURCE pixel, i.e. the grid
// must cover imgWidth x imgHeight threads. Threads outside that range do
// nothing.
//
// dataIn  : source image, imgHeight rows of imgWidth uchar3 elements, stored
//           densely row by row (read-only).
// dataOut : destination image, (k * imgHeight) rows of (k * imgWidth) uchar3
//           elements, stored densely row by row.
// k       : enlargement factor; nothing is written when k <= 0.
//
// Each thread reads one source pixel (adjacent threads in x read adjacent
// elements) and replicates it into the matching k x k square of the output.
__global__ void inter_nearest_k(const uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int k)
{
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    if(xIndex >= imgWidth || yIndex >= imgHeight)
    {
        return;
    }

    // Offsets are computed in size_t: the output holds k * k times as many
    // elements as the input, which can exceed the range of a 32-bit int.
    const uchar3 rgb = dataIn[(size_t)yIndex * imgWidth + xIndex];
    const size_t outWidth = (size_t)k * imgWidth;
    const size_t outColStart = (size_t)k * xIndex;

    for(int j = 0; j < k; j++) // y direction
    {
        uchar3 *outRow = dataOut + ((size_t)k * yIndex + j) * outWidth + outColStart;
        for(int i = 0; i < k; i++) // x direction
        {
            outRow[i] = rgb;
        }
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success so that calls can be chained with &&.
static bool cudaOk(cudaError_t status, const char *what)
{
    if(status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Enlarges "lisfan-70.jpg" k times in each direction, once with cv::resize
// (INTER_NEAREST) on the CPU and once with inter_nearest_k on the GPU, prints
// both timings, counts the pixels on which the two results differ and writes
// both results to disk. Returns 0 on success and 1 if the image cannot be
// read or any CUDA call fails.
int main(void)
{
    int k = 11;
    cv::Mat img_ori = cv::imread("lisfan-70.jpg");
    if(img_ori.empty())
    {
        std::cerr << "failed to read lisfan-70.jpg" << std::endl;
        return 1;
    }
    // The host-to-device copy below treats the pixel data as one dense block.
    if(!img_ori.isContinuous())
    {
        img_ori = img_ori.clone();
    }
    int imgWidth = img_ori.cols;
    int imgHeight = img_ori.rows;
    std::cout << "img_ori.cols: " << img_ori.cols << std::endl; // columns
    std::cout << "img_ori.rows: " << img_ori.rows << std::endl; // rows

    cv::Mat img_resize_cpu;
    auto start_cpu = std::chrono::steady_clock::now();
    cv::resize(img_ori, img_resize_cpu, cv::Size(k*img_ori.cols, k*img_ori.rows), 0, 0, cv::INTER_NEAREST);
    auto end_cpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro: duration expressed in microseconds
    std::cout<< "CPU resize time: "  << elapsed_cpu.count() << " us" << std::endl;
    // Timings recorded by the original author:
    // k=1, 1267.37 us; k=2, 3531.66 us; k=3, 5827.61 us; k=5, 12334.9 us; k=7, 17392.5 us; k=9, 25045.3 us; k=11, 36060.9 us
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);

    ///  GPU  //
    // Byte counts are computed in size_t so the k * k enlargement cannot
    // overflow int arithmetic.
    const size_t srcBytes = (size_t)imgHeight * imgWidth * sizeof(uchar3);
    const size_t dstBytes = srcBytes * k * k;

    uchar3 *d_in = nullptr;
    uchar3 *d_out = nullptr;

    cv::Mat img_resize_gpu(k * imgHeight, k * imgWidth, CV_8UC3);

    bool ok = cudaOk(cudaMalloc((void**)&d_in, srcBytes), "cudaMalloc(d_in)")
           && cudaOk(cudaMalloc((void**)&d_out, dstBytes), "cudaMalloc(d_out)")
           && cudaOk(cudaMemcpy(d_in, img_ori.data, srcBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if(ok)
    {
        // The kernel uses one thread per SOURCE pixel, so the grid only has
        // to cover the source image, not the enlarged one.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        auto start_gpu = std::chrono::steady_clock::now();
        inter_nearest_k<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, k);
        // Synchronise before stopping the clock; otherwise only the time the
        // CPU needs to launch the kernel would be measured.
        ok = cudaOk(cudaGetLastError(), "inter_nearest_k launch")
          && cudaOk(cudaDeviceSynchronize(), "inter_nearest_k execution");
        auto end_gpu = std::chrono::steady_clock::now();
        if(ok)
        {
            std::chrono::duration<double, std::micro> elapsed_gpu = end_gpu - start_gpu; // std::micro: duration expressed in microseconds
            std::cout<< "GPU resize time: "  << elapsed_gpu.count() << " us" << std::endl;
        }
        // Timings recorded by the original author (measured with a grid
        // sized to the enlarged image, so they may not match this launch):
        // k=1, 187 us; k=2, 263.203 us; k=3, 487 us; k=5, 1590.61 us; k=7, 3901.79 us; k=9, 7915.56 us; k=11, 13765.3 us
    }

    ok = ok && cudaOk(cudaMemcpy(img_resize_gpu.data, d_out, dstBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_in);
    cudaFree(d_out);

    if(!ok)
    {
        return 1;
    }

    // Count the pixels on which the CPU and GPU results disagree.
    // NOTE(review): a non-zero count does not necessarily mean the kernel is
    // wrong; cv::resize may round source coordinates differently - confirm.
    size_t diff = 0;
    for(int row = 0; row < img_resize_gpu.rows; row++)
    {
        const cv::Vec3b *gpuRow = img_resize_gpu.ptr<cv::Vec3b>(row);
        const cv::Vec3b *cpuRow = img_resize_cpu.ptr<cv::Vec3b>(row);
        for(int col = 0; col < img_resize_gpu.cols; col++)
        {
            if(gpuRow[col] != cpuRow[col]) diff += 1;
        }
    }
    std::cout << "diff: " << diff << std::endl;

    cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);

    return 0;
}

第二种方法:

写入global memory时coalesce

#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>

// Nearest-neighbour upscaling by an integer factor k in both x and y, with
// one thread per OUTPUT pixel.
//
// Launch layout: a 2D grid that covers imgWidth_k x imgHeight_k threads.
// Threads outside that range do nothing.
//
// dataIn      : source image, imgHeight rows of imgWidth uchar3 elements,
//               stored densely row by row (read-only).
// dataOut     : destination image, imgHeight_k rows of imgWidth_k uchar3
//               elements, stored densely row by row.
// imgHeight_k,
// imgWidth_k  : output dimensions; the caller passes k * imgHeight and
//               k * imgWidth.
// k           : enlargement factor; nothing is written when k <= 0.
//
// Adjacent threads in x write adjacent output elements; each group of k
// neighbouring threads reads the same source element.
__global__ void inter_nearest_k(const uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, int k)
{
    const int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIdx = threadIdx.y + blockIdx.y * blockDim.y;

    // k <= 0 would make the divisions below meaningless.
    if(k <= 0 || xIdx >= imgWidth_k || yIdx >= imgHeight_k)
    {
        return;
    }

    // Clamp to the last source row/column so that output dimensions larger
    // than k times the source can never read past the end of dataIn.
    const int srcX = min(xIdx / k, imgWidth - 1);
    const int srcY = min(yIdx / k, imgHeight - 1);

    // Offsets are computed in size_t: the output can hold more elements than
    // a 32-bit int can index.
    const size_t srcOffset = (size_t)srcY * imgWidth + srcX;
    const size_t dstOffset = (size_t)yIdx * imgWidth_k + xIdx;

    dataOut[dstOffset] = dataIn[srcOffset];
}

// Prints a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success so that calls can be chained with &&.
static bool cudaOk(cudaError_t status, const char *what)
{
    if(status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Enlarges "lisfan-70.jpg" k times in each direction, once with cv::resize
// (INTER_NEAREST) on the CPU and once with inter_nearest_k on the GPU, prints
// both timings, counts the pixels on which the two results differ and writes
// both results to disk. Returns 0 on success and 1 if the image cannot be
// read or any CUDA call fails.
int main(void)
{
    int k = 11;
    cv::Mat img_ori = cv::imread("lisfan-70.jpg");
    if(img_ori.empty())
    {
        std::cerr << "failed to read lisfan-70.jpg" << std::endl;
        return 1;
    }
    // The host-to-device copy below treats the pixel data as one dense block.
    if(!img_ori.isContinuous())
    {
        img_ori = img_ori.clone();
    }
    int imgWidth = img_ori.cols;
    int imgHeight = img_ori.rows;
    int imgHeight_k = imgHeight * k;
    int imgWidth_k = imgWidth * k;
    std::cout << "img_ori.cols: " << img_ori.cols << std::endl; // columns
    std::cout << "img_ori.rows: " << img_ori.rows << std::endl; // rows

    cv::Mat img_resize_cpu;
    auto start_cpu = std::chrono::steady_clock::now();
    cv::resize(img_ori, img_resize_cpu, cv::Size(imgWidth_k, imgHeight_k), 0, 0, cv::INTER_NEAREST);
    auto end_cpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro: duration expressed in microseconds
    std::cout<< "CPU resize time: "  << elapsed_cpu.count() << " us" << std::endl;
    // Timings recorded by the original author:
    // k=1, 1267.37 us; k=2, 3531.66 us; k=3, 5827.61 us; k=5, 12334.9 us; k=7, 17392.5 us; k=9, 25045.3 us; k=11, 36060.9 us
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);

    ///  GPU  //
    // Byte counts are computed in size_t so that the enlarged image size
    // cannot overflow int arithmetic.
    const size_t srcBytes = (size_t)imgHeight * imgWidth * sizeof(uchar3);
    const size_t dstBytes = (size_t)imgHeight_k * imgWidth_k * sizeof(uchar3);

    uchar3 *d_in = nullptr;
    uchar3 *d_out = nullptr;

    cv::Mat img_resize_gpu(imgHeight_k, imgWidth_k, CV_8UC3);

    bool ok = cudaOk(cudaMalloc((void**)&d_in, srcBytes), "cudaMalloc(d_in)")
           && cudaOk(cudaMalloc((void**)&d_out, dstBytes), "cudaMalloc(d_out)")
           && cudaOk(cudaMemcpy(d_in, img_ori.data, srcBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if(ok)
    {
        // One thread per OUTPUT pixel: the grid covers the enlarged image.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgWidth_k + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight_k + threadsPerBlock.y - 1) / threadsPerBlock.y);

        auto start_gpu = std::chrono::steady_clock::now();
        inter_nearest_k<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, imgHeight_k, imgWidth_k, k);
        // Synchronise before stopping the clock; otherwise only the time the
        // CPU needs to launch the kernel would be measured.
        ok = cudaOk(cudaGetLastError(), "inter_nearest_k launch")
          && cudaOk(cudaDeviceSynchronize(), "inter_nearest_k execution");
        auto end_gpu = std::chrono::steady_clock::now();
        if(ok)
        {
            std::chrono::duration<double, std::micro> elapsed_gpu = end_gpu - start_gpu; // std::micro: duration expressed in microseconds
            std::cout<< "GPU resize time: "  << elapsed_gpu.count() << " us" << std::endl;
        }
        // Timings recorded by the original author:
        // k=1, 187 us; k=2, 263.203 us; k=3, 370.887 us; k=5, 729.543 us; k=7, 1246.63 us; k=9, 1915.2 us; k=11, 2754.21 us
    }

    ok = ok && cudaOk(cudaMemcpy(img_resize_gpu.data, d_out, dstBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_in);
    cudaFree(d_out);

    if(!ok)
    {
        return 1;
    }

    // Count the pixels on which the CPU and GPU results disagree.
    // NOTE(review): a non-zero count does not necessarily mean the kernel is
    // wrong; cv::resize may round source coordinates differently - confirm.
    size_t diff = 0;
    for(int row = 0; row < imgHeight_k; row++)
    {
        const cv::Vec3b *gpuRow = img_resize_gpu.ptr<cv::Vec3b>(row);
        const cv::Vec3b *cpuRow = img_resize_cpu.ptr<cv::Vec3b>(row);
        for(int col = 0; col < imgWidth_k; col++)
        {
            if(gpuRow[col] != cpuRow[col]) diff += 1;
        }
    }
    std::cout << "diff: " << diff << std::endl;

    cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);

    return 0;
}

对应的CMakeLists.txt

# Builds the resize_gpu executable from resize_gpu.cu using the FindCUDA
# module (CUDA_ADD_EXECUTABLE) and links it against OpenCV.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(resize_gpu)
FIND_PACKAGE(OpenCV REQUIRED)
# libopencv_imgcodecs (imread / imwrite) is among the OpenCV libraries used
FIND_PACKAGE(CUDA REQUIRED)
# Make the OpenCV headers visible to the nvcc compile step; FindCUDA takes
# its include paths from the directory-level include directories.
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
CUDA_ADD_EXECUTABLE(resize_gpu resize_gpu.cu)
TARGET_LINK_LIBRARIES(resize_gpu ${OpenCV_LIBS})

# set(CMAKE_CXX_FLAGS "-std=c++11") # optional: the author reports it builds with or without this line
# Running through the editor's right-click "Run Code" does not work; build with CMake instead:
#   mkdir build && cd build
#   cmake .. && make
#   cd ..
#   ./build/resize_gpu   # run from the source directory so the relative image path resolves

结论:输出图像的数据量是输入图像的 k^2 倍,写入 global memory 的开销远大于读取的开销,所以让写入操作合并访问(coalesce)的第二种方法更快;从上面的计时可以看出,k 越大差距越明显(k=11 时约为 13765 us 对 2754 us)。



版权声明:本文为ysh1026原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。