Method 1:
Coalesce when loading from global memory (one thread per source pixel: each thread reads one input pixel and scatters it to a k x k block of the output).
#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>
#include "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>
// using namespace cv;
// using namespace std;
// enlarge k times in x and y direction
__global__ void inter_nearest_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int k)
{
    int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int yIndex = threadIdx.y + blockIdx.y * blockDim.y;
    if(xIndex < imgWidth && yIndex < imgHeight)
    {
        // each thread reads one source pixel and writes it to a k x k block of the output
        uchar3 rgb = dataIn[yIndex * imgWidth + xIndex];
        for(int j = 0; j < k; j++) // y direction
        {
            for(int i = 0; i < k; i++) // x direction
            {
                dataOut[(k * yIndex + j) * k * imgWidth + k * xIndex + i] = rgb;
            }
        }
    }
}
int main(void)
{
    int k = 11;
    cv::Mat img_ori = cv::imread("lisfan-70.jpg");
    int imgWidth = img_ori.cols;
    int imgHeight = img_ori.rows;
    std::cout << "img_ori.cols: " << img_ori.cols << std::endl; // columns
    std::cout << "img_ori.rows: " << img_ori.rows << std::endl; // rows
    cv::Mat img_resize_cpu;
    auto start_cpu = std::chrono::steady_clock::now();
    cv::resize(img_ori, img_resize_cpu, cv::Size(k * img_ori.cols, k * img_ori.rows), 0, 0, cv::INTER_NEAREST);
    auto end_cpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro: report the duration in microseconds
    std::cout << "CPU resize time: " << elapsed_cpu.count() << " us" << std::endl;
    // k=1, 1267.37 us; k=2, 3531.66 us; k=3, 5827.61 us; k=5, 12334.9 us; k=7, 17392.5 us; k=9, 25045.3 us; k=11, 36060.9 us
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);
    /// GPU //
    uchar3 *d_in;
    uchar3 *d_out;
    cv::Mat img_resize_gpu(k * imgHeight, k * imgWidth, CV_8UC3);
    cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(uchar3));
    cudaMalloc((void**)&d_out, k * k * imgHeight * imgWidth * sizeof(uchar3));
    cudaMemcpy(d_in, img_ori.data, imgHeight * imgWidth * sizeof(uchar3), cudaMemcpyHostToDevice);
    dim3 threadsPerBlock(32, 32);
    // the grid covers the enlarged image; only threads whose indices fall inside the source image do any work
    dim3 blocksPerGrid((k * imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (k * imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);
    auto start_gpu = std::chrono::steady_clock::now();
    inter_nearest_k<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, k);
    cudaDeviceSynchronize(); // synchronize CPU and GPU; otherwise only the CPU-side kernel-launch time would be measured
    auto end_gpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_gpu = end_gpu - start_gpu; // std::micro: report the duration in microseconds
    std::cout << "GPU resize time: " << elapsed_gpu.count() << " us" << std::endl;
    // k=1, 187 us; k=2, 263.203 us; k=3, 487 us; k=5, 1590.61 us; k=7, 3901.79 us; k=9, 7915.56 us; k=11, 13765.3 us
    cudaMemcpy(img_resize_gpu.data, d_out, k * k * imgHeight * imgWidth * sizeof(uchar3), cudaMemcpyDeviceToHost);
    int diff = 0;
    for(int i = 0; i < img_resize_gpu.cols; i++)
    {
        for(int j = 0; j < img_resize_gpu.rows; j++)
        {
            // compare the GPU and CPU results pixel by pixel
            cv::Vec3b p_gpu = img_resize_gpu.at<cv::Vec3b>(j, i);
            cv::Vec3b p_cpu = img_resize_cpu.at<cv::Vec3b>(j, i);
            if(p_gpu[0] != p_cpu[0] || p_gpu[1] != p_cpu[1] || p_gpu[2] != p_cpu[2]) diff += 1;
        }
    }
    std::cout << "diff: " << diff << std::endl;
    cudaFree(d_in);
    cudaFree(d_out);
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);
    return 0;
}
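For reference, <opencv2/cudawarping.hpp> is included above but never used. Below is a minimal sketch of the same k-times nearest-neighbor upscale done through cv::cuda::resize; it assumes OpenCV was built with its CUDA modules, the output filename is only for illustration, and it is not part of the benchmark above.
// sketch: nearest-neighbor upscale with OpenCV's CUDA module
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>
#include <string>
int main()
{
    int k = 11;
    cv::Mat img_ori = cv::imread("lisfan-70.jpg");
    cv::cuda::GpuMat d_src, d_dst;
    d_src.upload(img_ori);                                  // host -> device copy
    cv::cuda::resize(d_src, d_dst, cv::Size(k * img_ori.cols, k * img_ori.rows),
                     0, 0, cv::INTER_NEAREST);              // resize on the GPU
    cv::Mat img_resize_cv_cuda;
    d_dst.download(img_resize_cv_cuda);                     // device -> host copy
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_cvcuda.jpg", img_resize_cv_cuda);
    return 0;
}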
Method 2:
Coalesce when writing to global memory (one thread per output pixel: each thread reads its source pixel at (x/k, y/k), so consecutive threads write consecutive output addresses).
#include "cuda_runtime.h"
#include <cuda.h>
#include <time.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include "opencv2/highgui.hpp"
#include <iostream>
#include <math.h>
#include <chrono>
// enlarge the original image k times in x and y direction
// write dataOut coalesced
__global__ void inter_nearest_k(uchar3 *dataIn, uchar3 *dataOut, int imgHeight, int imgWidth, int imgHeight_k, int imgWidth_k, int k)
{
    int xIdx = threadIdx.x + blockIdx.x * blockDim.x;
    int yIdx = threadIdx.y + blockIdx.y * blockDim.y;
    if(xIdx < imgWidth_k && yIdx < imgHeight_k)
    {
        // one thread per output pixel: adjacent threads in x write adjacent output
        // addresses (coalesced), while each group of k threads re-reads the same input pixel
        dataOut[yIdx * imgWidth_k + xIdx] = dataIn[(yIdx / k) * imgWidth + xIdx / k];
    }
}
int main(void)
{
    int k = 11;
    cv::Mat img_ori = cv::imread("lisfan-70.jpg");
    int imgWidth = img_ori.cols;
    int imgHeight = img_ori.rows;
    int imgHeight_k = imgHeight * k;
    int imgWidth_k = imgWidth * k;
    std::cout << "img_ori.cols: " << img_ori.cols << std::endl; // columns
    std::cout << "img_ori.rows: " << img_ori.rows << std::endl; // rows
    cv::Mat img_resize_cpu;
    auto start_cpu = std::chrono::steady_clock::now();
    cv::resize(img_ori, img_resize_cpu, cv::Size(imgWidth_k, imgHeight_k), 0, 0, cv::INTER_NEAREST);
    auto end_cpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_cpu = end_cpu - start_cpu; // std::micro: report the duration in microseconds
    std::cout << "CPU resize time: " << elapsed_cpu.count() << " us" << std::endl;
    // k=1, 1267.37 us; k=2, 3531.66 us; k=3, 5827.61 us; k=5, 12334.9 us; k=7, 17392.5 us; k=9, 25045.3 us; k=11, 36060.9 us
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_cpu.jpg", img_resize_cpu);
    /// GPU //
    uchar3 *d_in;
    uchar3 *d_out;
    cv::Mat img_resize_gpu(imgHeight_k, imgWidth_k, CV_8UC3);
    cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(uchar3));
    cudaMalloc((void**)&d_out, imgHeight_k * imgWidth_k * sizeof(uchar3));
    cudaMemcpy(d_in, img_ori.data, imgHeight * imgWidth * sizeof(uchar3), cudaMemcpyHostToDevice);
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth_k + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight_k + threadsPerBlock.y - 1) / threadsPerBlock.y);
    auto start_gpu = std::chrono::steady_clock::now();
    inter_nearest_k<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, imgHeight_k, imgWidth_k, k);
    cudaDeviceSynchronize(); // synchronize CPU and GPU; otherwise only the CPU-side kernel-launch time would be measured
    auto end_gpu = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed_gpu = end_gpu - start_gpu; // std::micro: report the duration in microseconds
    std::cout << "GPU resize time: " << elapsed_gpu.count() << " us" << std::endl;
    // k=1, 187 us; k=2, 263.203 us; k=3, 370.887 us; k=5, 729.543 us; k=7, 1246.63 us; k=9, 1915.2 us; k=11, 2754.21 us
    cudaMemcpy(img_resize_gpu.data, d_out, imgHeight_k * imgWidth_k * sizeof(uchar3), cudaMemcpyDeviceToHost);
    int diff = 0;
    for(int i = 0; i < imgWidth_k; i++)
    {
        for(int j = 0; j < imgHeight_k; j++)
        {
            // compare the GPU and CPU results pixel by pixel
            cv::Vec3b p_gpu = img_resize_gpu.at<cv::Vec3b>(j, i);
            cv::Vec3b p_cpu = img_resize_cpu.at<cv::Vec3b>(j, i);
            if(p_gpu[0] != p_cpu[0] || p_gpu[1] != p_cpu[1] || p_gpu[2] != p_cpu[2]) diff += 1;
        }
    }
    std::cout << "diff: " << diff << std::endl;
    cudaFree(d_in);
    cudaFree(d_out);
    cv::imwrite("lisfan-70_" + std::to_string(k) + "_gpu.jpg", img_resize_gpu);
    return 0;
}
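The code above times the kernel with std::chrono plus cudaDeviceSynchronize(). An alternative is to time the kernel with CUDA events, which measure elapsed time on the GPU itself; a minimal sketch (assuming the same d_in, d_out, grid and block variables as in main() above) could look like this:
// sketch: time the kernel with CUDA events instead of std::chrono
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
cudaEventRecord(ev_start);
inter_nearest_k<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgHeight, imgWidth, imgHeight_k, imgWidth_k, k);
cudaEventRecord(ev_stop);
cudaEventSynchronize(ev_stop);                // wait for the kernel and the stop event to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, ev_start, ev_stop); // elapsed time between the two events, in milliseconds
std::cout << "GPU resize time (events): " << ms * 1000.0f << " us" << std::endl;
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);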
The corresponding CMakeLists.txt:
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(resize_gpu)
FIND_PACKAGE(OpenCV REQUIRED)
# libopencv_imgcodecs
FIND_PACKAGE(CUDA REQUIRED)
CUDA_ADD_EXECUTABLE(resize_gpu resize_gpu.cu)
TARGET_LINK_LIBRARIES(resize_gpu ${OpenCV_LIBS})
# set(CMAKE_CXX_FLAGS "-std=c++11") # this line is optional
# right-clicking "Run Code" in the editor does not work; build and run from the command line instead:
# mkdir build && cd build
# cmake .. && make
# cd ..
# ./build/resize_gpu
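CUDA_ADD_EXECUTABLE comes from CMake's old FindCUDA module, which is deprecated in recent CMake releases. As a sketch only (not tested here), with CMake 3.10 or newer the same project can treat CUDA as a first-class language:
CMAKE_MINIMUM_REQUIRED(VERSION 3.10)
PROJECT(resize_gpu LANGUAGES CXX CUDA)
FIND_PACKAGE(OpenCV REQUIRED)
ADD_EXECUTABLE(resize_gpu resize_gpu.cu)
TARGET_LINK_LIBRARIES(resize_gpu ${OpenCV_LIBS})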
Because the amount of data written to global memory is k^2 times the amount read, coalescing the writes (method 2) is the faster of the two.
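As a rough traffic estimate (assuming, purely for illustration, a 640x480 source image, which may not be the actual size of lisfan-70.jpg): the kernels read 640 * 480 * 3 bytes, about 0.9 MB, from global memory, but for k = 11 they write k^2 = 121 times that, roughly 111 MB. The write pattern therefore dominates the memory traffic, which matches the timings above: at k = 11 the write-coalesced kernel takes about 2754 us versus about 13765 us for the load-coalesced one, roughly a 5x speedup.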
Copyright notice: this is an original article by ysh1026, released under the CC 4.0 BY-SA license; please include a link to the original source and this notice when reposting.