环境:
matlab2015b vs2010 cuda7.5
步骤一:创建
ComplexMul.h
文件
#include”cuComplex.h” //因为处理的数据是复数,需要添加这个头文件
#ifndef__COMPLEXMUL_H__
#define__COMPLEXMUL_H__
//声明所需要的函数。extern表示该函数在其他文件中执行
externvoid ComplexMul(cuComplex* A, cuComplex*B, cuComplex* C, int size);//实现矩阵点乘
externvoid Scale(cuComplex* Input, cuComplex*Output, int size);//实现归一化
#endif//__COMPLEXMUL_H__
步骤二:在
ComplexMul.cu
中实现声明的函数
#include “ComplexMul.h”
#include “mex.h”
#include “cuComplex.h”
__global__ void ComplexMulMask(cuComplex* A, cuComplex* B, cuComplex* C, int size)
{
const intnumThreads = blockDim.x * gridDim.x;
const intthreadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i= threadID; i < size; i += numThreads)
{
C[i].x= A[i].x * B[i].x – A[i].y * B[i].y;
C[i].y= A[i].x * B[i].y + A[i].y * B[i].x;
}
}
__global__ void ScaleMask(cuComplex* input, floatscale, int size)
{
const intnumThreads = blockDim.x * gridDim.x;
const intthreadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i= threadID; i < size; i += numThreads)
{
input[i].x *=scale;
input[i].y*=scale;
}
}
void ComplexMul(cuComplex* A, cuComplex* B, cuComplex* C, int size)
{
cuComplex*devPtrA=0, *devPtrB=0, *devPtrC=0;
cudaMalloc(&devPtrA,sizeof(cuComplex)*size);
cudaMalloc(&devPtrB,sizeof(cuComplex)*size);
cudaMalloc(&devPtrC,sizeof(cuComplex)*size);
cudaMemcpy(devPtrA,A,sizeof(cuComplex)*size, cudaMemcpyHostToDevice );
cudaMemcpy(devPtrB,B,sizeof(cuComplex)*size, cudaMemcpyHostToDevice );
ComplexMulMask<<<128,128>>>(devPtrA,devPtrB, devPtrC, size);
cudaMemcpy(C,devPtrC,sizeof(cuComplex)*size,cudaMemcpyDeviceToHost);
cudaFree(devPtrA);
cudaFree(devPtrB);
cudaFree(devPtrC);
}
void Scale(cuComplex*Input, cuComplex* Output, int size)
{
cuComplex*devPtrIn=0, *devPtrOut=0;
cudaMalloc(&devPtrIn,sizeof(cuComplex)*size);
cudaMalloc(&devPtrOut,sizeof(cuComplex)*size);
cudaMemcpy(devPtrIn,Input,sizeof(cuComplex)*size, cudaMemcpyHostToDevice );
ScaleMask<<<128,128>>>(devPtrIn,1.0f/size, size);
cudaMemcpy(Output,devPtrIn,sizeof(cuComplex)*size,cudaMemcpyDeviceToHost);
cudaFree(devPtrIn);
cudaFree(devPtrOut);
}
步骤三:使用
-c
选项编译
CUDA
代码,生成目标文件。
在matlab命令窗口输入:
>>system(‘nvcc–c
ComplexMul
.cu’)
编译成功后生成ComplexMul.obj文件
步骤四:创建mex
函数,在matlab里面创建PulseCompression.cpp文件
//脉冲压缩
#include “mex.h”
#include <cuda_runtime.h>
#include <cufft.h>
#include “ComplexMul.h”
#defineNX 16384 //脉压点数,必须是2的整数次幂
#defineBATCH 1
voidmexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
if( nrhs != 4 )
mexErrMsgTxt(“Invaid number ofinput arguments”);//检查输入参数个数是否为2
double* signal_i =(double*)mxGetData(prhs[0]); //读取数据
double* signal_q = (double*)mxGetData(prhs[1]);
double* filter_i =(double*)mxGetData(prhs[2]);
double* filter_q =(double*)mxGetData(prhs[3]);
cufftComplex*signal_H=(cufftComplex*)malloc(NX*BATCH*sizeof(cufftComplex));
cufftComplex* filter_H=(cufftComplex*)malloc(NX*BATCH*sizeof(cufftComplex));
// printf(“输入信号为:\n”);
for(int i=0;i<NX;i++)
{
signal_H[i].x=signal_i[i];
signal_H[i].y=signal_q[i];
// if(i<10) printf(“%.4f+%.4fi\n”,signal_i[i],signal_q[i] );
}
//printf(“输入滤波函数为:\n”);
for(int i=0;i<NX;i++)
{
filter_H[i].x=filter_i[i];
filter_H[i].y=filter_q[i];
// if(i<10) printf(“%.4f+%.4fi\n”,filter_i[i],filter_q[i] );
}
cufftComplex *signal_D; //存储信号
cudaMalloc( &signal_D, sizeof(cufftComplex)*NX*BATCH); // malloc memory
cudaMemcpy( signal_D, signal_H,sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice); // copy memory from host to device
cufftComplex *filter_D; //存储滤波器系数
cudaMalloc( &filter_D, sizeof(cufftComplex)*NX*BATCH); // malloc memory
cudaMemcpy( filter_D, filter_H,sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice); // copy memory from host to device
cufftComplex* signal_deviceOut; //存储信号fft的结果
cudaMalloc( &signal_deviceOut,sizeof(cufftComplex)*NX*BATCH );
cufftComplex* filter_deviceOut; //存储滤波器系数 fft的结果
cudaMalloc( &filter_deviceOut,sizeof(cufftComplex)*NX*BATCH );
//记录时间
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);//计时开始
cufftHandle plan_S; //FFT的参数都相同,只需要创建一个plan
cufftPlan1d(&plan_S, NX*BATCH,CUFFT_C2C, BATCH);
cufftExecC2C(plan_S, signal_D,signal_deviceOut, CUFFT_FORWARD );
cufftExecC2C(plan_S, filter_D,filter_deviceOut, CUFFT_FORWARD );
cufftComplex* MatchS_device; //存储点乘的结果
cudaMalloc( &MatchS_device,sizeof(cufftComplex)*NX*BATCH );
ComplexMul(signal_deviceOut,filter_deviceOut, MatchS_device,NX*BATCH);//调用函数计算复数点乘
cufftComplex* MatchSifft_device; //存储ifft之后的结果
cudaMalloc( &MatchSifft_device,sizeof(cufftComplex)*NX*BATCH );
cufftExecC2C(plan_S, MatchS_device,MatchSifft_device, CUFFT_INVERSE );
cufftComplex* MatchS_scale; //存储归一化之后的结果
cudaMalloc( &MatchS_scale,sizeof(cufftComplex)*NX*BATCH );
Scale(MatchSifft_device, MatchS_scale,NX);//调用函数对ifft结果进行归一
//记录结束时间
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);//计时结束
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
//printf(“%3.3fms\t”,elapsedTime);//输出计时结果
cudaEventDestroy(start);
cudaEventDestroy(stop);
float* signal_out = (float*)mxMalloc(sizeof(cufftComplex)*NX*BATCH );
cudaMemcpy(signal_out,MatchS_scale,sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost );
plhs[0] = mxCreateNumericMatrix( BATCH,NX, mxSINGLE_CLASS, mxCOMPLEX);
plhs[1] = mxCreateDoubleMatrix(1,1,mxREAL);
double *T;
T=mxGetPr(plhs[1]);
*T=elapsedTime;
float* S_real = (float*)mxGetPr(plhs[0]);
float* S_imag = (float*)mxGetPi(plhs[0]);
float* S_complex = signal_out;
for ( int c = 0; c < BATCH; ++c )
{
for (int r = 0; r < NX; ++r )
{
*S_real++ = *S_complex++;
*S_imag++ = *S_complex++;
}
}
mxFree(signal_out);
cufftDestroy(plan_S);
cudaFree(signal_D);
}
步骤五:编译
mex
,在
matlab
里输入以下命令:
>>mex PulseCompression.cpp complexMul.obj -lcudart -lcufft -L”C:\ProgramFiles\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64″ -v-I”C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include”
l -lcudart:告知mex正在使用cuda运行时库。(l是小写的L)
l -lcufft:表明子啊调用CUFFT库。
l -Ldir:dir是CUDA和CUFFT库所在的目录。
l -Idir:dir是CUDA和CUFFT头文件所在的目录。(I是大写的i)
编译成功会生成PulseCompression.mexw64文件。
步骤六:在
matlab
里面调用
PulseCompression
函数。编写
PulseCom.m
文件计算设备端脉压并与
matlab
脉压结果做对比。
clc;
clearall;
MON=1000;
fori=1:MON
%随机生成的数据
data_AI=rand(1,FFT_N)*100;
data_AQ=rand(1,FFT_N)*100;
PC_filter_I=rand(1,FFT_N)*100;
PC_filter_Q=rand(1,FFT_N)*100;
%设备端脉压
tic
[MatchSo_GPU,Tfft(i)]=PulseCompression(data_AI,data_AQ,PC_filter_I,PC_filter_Q);
TD(i)=toc;
%MATALB做脉压
tic
dataA=data_AI+j*data_AQ;
dataA_fft=fft(dataA,FFT_N);
PC_filter=PC_filter_I+j*PC_filter_Q;
Match_fft=fft(PC_filter,FFT_N);
MatchS=dataA_fft.*Match_fft;
MatchSo=ifft(MatchS);%脉压输出
TH(i)=toc;
end
T_fft=sum(Tfft)/MON
Tdev=sum(TD)/MON
THos=sum(TH)/MON
figure();
plot(abs(MatchSo));
xlabel(‘距离单元’)
ylabel(‘脉压输出’)
title(‘Matlab端脉冲压缩’);grid;
figure();
plot(abs(MatchSo_GPU));
xlabel(‘距离单元’)
ylabel(‘脉压输出’)
title(‘GPU端脉冲压缩’);grid;
结果: