c++ 并行计算的优化方法

前言

关于并行计算可以参考：

https://computing.llnl.gov/tutorials/parallel_comp/

另外关于并行计算的常见方法总结可以参考：

https://yq.aliyun.com/articles/68226

常用方法

并行计算的优化可以从CPU 和 GPU 两个途径进行。

在CPU 上根据指令流和数据流可以分为SIMD(单指令多数据) 和 MIMD (多指令多数据)。

SIMD 的方法主要是使用 NEON（ARM）、SSE/AVX（x86）等 SIMD 指令（汇编或 intrinsics）进行优化（关于 SIMD 的使用简介，原文此处的参考链接已丢失）。

MIMD 基于多核 CPU 的场景，常用的库主要有 TBB 和 OpenMP。

GPU 上的方法主要是opencl 的使用。

方法的使用

这里记录下tbb 和 openmp 的基本使用

tbb

官方下载地址： https://github.com/intel/tbb

可以下载源码编译，也可以下载release的版本库。

编译中使用:

g++ -g -I/path/tbb/include/ test.cpp -o test -L/path/tbb/lib/intel64/gcc4.4/ -Wl,-rpath=/path/tbb/lib/intel64/gcc4.4/ -ltbb -lrt

openmp

OpenMP 是一套支持跨平台、基于共享内存的多线程并发编程 API。其使用很简单，目前主流编译器（如 GCC）大多自带 OpenMP 支持。

编译中使用：

编译选项中添加 -fopenmp

测试代码

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>  
#include "tbb/parallel_for.h"
#include "tbb/blocked_range2d.h"

using namespace tbb;

/**
 * @brief Read CLOCK_MONOTONIC and return it as a single nanosecond count.
 *
 * The value is only meaningful as a difference between two calls (elapsed
 * time); callers in this file divide the difference by 1000 to print
 * microseconds.
 *
 * @return tv_sec * 1e9 + tv_nsec of the monotonic clock, as a long.
 *         NOTE(review): on platforms where long is 32 bits this overflows
 *         after ~2 seconds of clock value — confirm the target is LP64.
 */
inline long int getTickNs()
{
    struct timespec now = {0, 0};
    // The return code is not checked; on failure `now` stays {0, 0} and the
    // function returns 0.
    clock_gettime(CLOCK_MONOTONIC, &now);

    // One second is 1,000,000,000 ns. The previous factor of 1,000,000 mixed
    // microsecond-scaled seconds with a nanosecond remainder, which produced
    // wrong (even negative) deltas whenever a measurement crossed a second.
    const long nsPerSecond = 1000000000L;
    return static_cast<long>(now.tv_sec) * nsPerSecond + now.tv_nsec;
}

/**
 * @brief Current calendar time from time(NULL), converted to int.
 *
 * @return The value of time(NULL) narrowed to int.
 */
inline int getTime()
{
    const time_t secondsNow = time(NULL);
    return static_cast<int>(secondsNow);
}

// Number of input arrays (rows); main() keeps one row pointer per array.
const uint32_t ARR_NUM = 90;
// Number of elements in each input array, and in the result array.
const uint32_t ARR_SIZE = 80000;

// Element type of the input rows and of the result buffer.
#define DATA_TYPE int

#if 1
/**
 * TBB body that adds a 2D block of inputs into a result array.
 *
 * For every row index i in r.rows() and every column index j in r.cols()
 * it performs m_arrSum[i] += m_arr[j][i]; i.e. the "row" dimension of the
 * range selects the result slot and the "column" dimension selects which
 * input array is read.
 *
 * The object only stores the two pointers; it does not own the memory.
 *
 * NOTE(review): m_arrSum[i] is updated without synchronization. If two
 * tasks receive blocks that share row indices but cover different column
 * sub-ranges, they write the same slot concurrently — confirm callers
 * prevent the column dimension from being split, or add atomics.
 */
class SumBody2D {
private:
    int *m_arrSum;
    int **m_arr;
public:
    void operator()( const blocked_range2d<size_t>& r ) const {
        const size_t colBegin = r.cols().begin();
        const size_t colEnd = r.cols().end();
        for( size_t i=r.rows().begin(); i!=r.rows().end(); ++i ){
            // Sum the block's column slice locally so the shared result
            // slot is read and written once per row instead of once per
            // element.
            int partial = 0;
            for( size_t j=colBegin; j!=colEnd; ++j ) {
                partial += m_arr[j][i];
            }
            m_arrSum[i] += partial;
        }
    }
    /**
     * @param arr  table of input arrays, indexed as arr[j][i]
     * @param sum  result array, indexed as sum[i]
     */
    SumBody2D( int **arr, int *sum)
    : m_arrSum(sum)   // initializers listed in declaration order (-Wreorder)
    , m_arr(arr)
    {}
};

class SumBody {
private:
    DATA_TYPE *m_arrSum;
    DATA_TYPE **m_arr;
public:
    void operator()( const blocked_range<size_t>& r ) const {
        for( size_t i=r.begin(); i!=r.end(); ++i ){
            for( size_t j=0; j<ARR_NUM; j+=10 ) {
                
                m_arrSum[i] += m_arr[j][i];
                m_arrSum[i] += m_arr[j+1][i];
                m_arrSum[i] += m_arr[j+2][i];
                m_arrSum[i] += m_arr[j+3][i];

                m_arrSum[i] += m_arr[j+4][i];
                m_arrSum[i] += m_arr[j+5][i];
                m_arrSum[i] += m_arr[j+6][i];
                m_arrSum[i] += m_arr[j+7][i];

                m_arrSum[i] += m_arr[j+8][i];
                m_arrSum[i] += m_arr[j+9][i];
            }
        }
    }
    SumBody( DATA_TYPE **arr, DATA_TYPE *sum) 
    : m_arr(arr) 
    , m_arrSum(sum)
    {}
};
#endif

int main()
{
    DATA_TYPE *arr[ARR_NUM];
    arr[0] = new DATA_TYPE[ARR_NUM * ARR_SIZE];
    for(int i=1; i<ARR_NUM; i++)
    {
        arr[i] = arr[i-1] + ARR_SIZE;
    }
    // memset(arr[0], 1, sizeof(int) * ARR_NUM * ARR_SIZE);
    long int start = getTickNs();
    for(int i=0; i<ARR_NUM; i++)
    {
        for(int j=0; j<ARR_SIZE; j++)
        {
            arr[i][j] = 1.0f;
        }
    }
    long int end = getTickNs();
    printf("set val time cost: %lldus\n", (end - start)/1000);
    printf("arr[0][0]: %d\n", arr[0][ARR_SIZE-1]);
    printf("arr[2][0]: %d\n", arr[ARR_NUM-1][ARR_SIZE-1]);

    DATA_TYPE relt[ARR_SIZE] = {0.0f};
    start = getTickNs();
    for(int i=0; i<ARR_SIZE; i++)
    {
        for(int j=0; j<ARR_NUM; j+=10)
        {
            // #pragma omp atomic
            // relt[i] += arr[j][i];
            relt[i] += arr[j][i];
            relt[i] += arr[j+1][i];
            relt[i] += arr[j+2][i];
            relt[i] += arr[j+3][i];
            relt[i] += arr[j+4][i];
            relt[i] += arr[j+5][i];
            relt[i] += arr[j+6][i];
            relt[i] += arr[j+7][i];
            relt[i] += arr[j+8][i];
            relt[i] += arr[j+9][i];
        }
    }

    printf("get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);

    // start = getTickNs();
    // #pragma omp parallel
    for(int i=0; i<10; i++)
    {
        start = getTickNs();
        #pragma omp parallel for //collapse(2)
        for(int i=0; i<ARR_SIZE; i++)
        {
            #pragma simd
            for(int j=0; j<ARR_NUM; j+=10)
            {
                relt[i] += arr[j][i];
                relt[i] += arr[j+1][i];
                relt[i] += arr[j+2][i];
                relt[i] += arr[j+3][i];

                relt[i] += arr[j+4][i];
                relt[i] += arr[j+5][i];
                relt[i] += arr[j+6][i];
                relt[i] += arr[j+7][i];
                
                relt[i] += arr[j+8][i];
                relt[i] += arr[j+9][i];
            }
        }
        printf("openmp get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);
    }


    for(int i=0; i<10; i++)
    {
        start = getTickNs();
        parallel_for( blocked_range<size_t>(0, ARR_SIZE), SumBody(arr, relt) );
        printf("tbb get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);
    }

    delete [](arr[0]);

    return 0;
}

测试结果（Intel Xeon® CPU E5）：

// 说明：结果数组 relt 在各轮测试之间没有清零，因此 reslt[0] 每轮会累加 90（即 ARR_NUM）；这不是计算错误，重点关注时间消耗
get reslt, reslt[0]: 90, cost:37251us                                                                                                                                                 
openmp get reslt, reslt[0]: 180, cost:17382us                                                                                                                                         
openmp get reslt, reslt[0]: 270, cost:13578us                                                                                                                                         
openmp get reslt, reslt[0]: 360, cost:13842us                                                                                                                                         
openmp get reslt, reslt[0]: 450, cost:13713us                                                                                                                                         
openmp get reslt, reslt[0]: 540, cost:13772us                                                                                                                                         
openmp get reslt, reslt[0]: 630, cost:13583us                                                                                                                                         
openmp get reslt, reslt[0]: 720, cost:13747us                                                                                                                                         
openmp get reslt, reslt[0]: 810, cost:13932us                                                                                                                                         
openmp get reslt, reslt[0]: 900, cost:13720us                                                                                                                                         
openmp get reslt, reslt[0]: 990, cost:13561us                                                                                                                                         
tbb get reslt, reslt[0]: 1080, cost:15830us                                                                                                                                           
tbb get reslt, reslt[0]: 1170, cost:13202us                                                                                                                                           
tbb get reslt, reslt[0]: 1260, cost:13699us                                                                                                                                           
tbb get reslt, reslt[0]: 1350, cost:12965us                                                                                                                                           
tbb get reslt, reslt[0]: 1440, cost:13208us                                                                                                                                           
tbb get reslt, reslt[0]: 1530, cost:13397us                                                                                                                                           
tbb get reslt, reslt[0]: 1620, cost:13385us                                                                                                                                           
tbb get reslt, reslt[0]: 1710, cost:13202us                                                                                                                                           
tbb get reslt, reslt[0]: 1800, cost:13098us                                                                                                                                           
tbb get reslt, reslt[0]: 1890, cost:13558us

原文链接：https://blog.csdn.net/xubuwei/article/details/103478227

前言

常用方法

方法的使用

测试代码

你可能也喜欢