NCNN conv3x3 native (naive) implementation
   
    
    
    conv3x3s1_sse
   
static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;
    const float* kernel = _kernel;
    const float* bias = _bias;
    #pragma omp parallel for num_threads(opt.num_threads)   //设置omp多线程
    for (int p=0; p<outch; p++)               //最外层为输出个数
    {
        Mat out = top_blob.channel(p);        //指针向后移动w*w位置,移到下一channel位置
        const float bias0 = bias ? bias[p] : 0.f;
        out.fill(bias0); //将一个核长度的位置填充为bias,之后为相加
        for (int q=0; q<inch; q++) //内循环输入通道数
        {
            float* outptr = out;               
            float* outptr2 = outptr + outw;
            const float* img0 = bottom_blob.channel(q);
            const float* kernel0 = kernel + p*inch*9  + q*9;   //定位不同通道的核
            const float* r0 = img0;       //一次性输入4行
            const float* r1 = img0 + w;
            const float* r2 = img0 + w*2;
            const float* r3 = img0 + w*3;
            const float* k0 = kernel0;    //4行和核卷积
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;
            int i = 0;
            for (; i+1 < outh; i+=2)   //一次输出2行
            {
                int remain = outw;
                for (; remain>0; remain--)
                {
                    float sum = 0;
                    float sum2 = 0;
                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum2 += r1[0] * k0[0];
                    sum2 += r1[1] * k0[1];
                    sum2 += r1[2] * k0[2];
                    sum2 += r2[0] * k1[0];
                    sum2 += r2[1] * k1[1];
                    sum2 += r2[2] * k1[2];
                    sum2 += r3[0] * k2[0];
                    sum2 += r3[1] * k2[1];
                    sum2 += r3[2] * k2[2];
                    *outptr += sum;
                    *outptr2 += sum2;
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr++;
                    outptr2++;
                }
                r0 += 2 + w;   //到下一个两行的首部
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;
                outptr += outw;
                outptr2 += outw;
            }
            for (; i < outh; i++)   //多余的部分
            {
                int remain = outw;
                for (; remain>0; remain--)
                {
                    float sum = 0;
                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    *outptr += sum;
                    r0++;
                    r1++;
                    r2++;
                    outptr++;
                }
                r0 += 2;
                r1 += 2;
                r2 += 2;
            }
        }
    }
}
     
   
 
版权声明:本文为xiaohe004原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
