NCNN conv3*3 native
conv3x3s1_sse
static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
const float* kernel = _kernel;
const float* bias = _bias;
#pragma omp parallel for num_threads(opt.num_threads) //设置omp多线程
for (int p=0; p<outch; p++) //最外层为输出个数
{
Mat out = top_blob.channel(p); //指针向后移动w*w位置,移到下一channel位置
const float bias0 = bias ? bias[p] : 0.f;
out.fill(bias0); //将一个核长度的位置填充为bias,之后为相加
for (int q=0; q<inch; q++) //内循环输入通道数
{
float* outptr = out;
float* outptr2 = outptr + outw;
const float* img0 = bottom_blob.channel(q);
const float* kernel0 = kernel + p*inch*9 + q*9; //定位不同通道的核
const float* r0 = img0; //一次性输入4行
const float* r1 = img0 + w;
const float* r2 = img0 + w*2;
const float* r3 = img0 + w*3;
const float* k0 = kernel0; //4行和核卷积
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
int i = 0;
for (; i+1 < outh; i+=2) //一次输出2行
{
int remain = outw;
for (; remain>0; remain--)
{
float sum = 0;
float sum2 = 0;
sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];
sum2 += r1[0] * k0[0];
sum2 += r1[1] * k0[1];
sum2 += r1[2] * k0[2];
sum2 += r2[0] * k1[0];
sum2 += r2[1] * k1[1];
sum2 += r2[2] * k1[2];
sum2 += r3[0] * k2[0];
sum2 += r3[1] * k2[1];
sum2 += r3[2] * k2[2];
*outptr += sum;
*outptr2 += sum2;
r0++;
r1++;
r2++;
r3++;
outptr++;
outptr2++;
}
r0 += 2 + w; //到下一个两行的首部
r1 += 2 + w;
r2 += 2 + w;
r3 += 2 + w;
outptr += outw;
outptr2 += outw;
}
for (; i < outh; i++) //多余的部分
{
int remain = outw;
for (; remain>0; remain--)
{
float sum = 0;
sum += r0[0] * k0[0];
sum += r0[1] * k0[1];
sum += r0[2] * k0[2];
sum += r1[0] * k1[0];
sum += r1[1] * k1[1];
sum += r1[2] * k1[2];
sum += r2[0] * k2[0];
sum += r2[1] * k2[1];
sum += r2[2] * k2[2];
*outptr += sum;
r0++;
r1++;
r2++;
outptr++;
}
r0 += 2;
r1 += 2;
r2 += 2;
}
}
}
}
版权声明:本文为xiaohe004原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。