前言
关于并行计算可以参考:
https://computing.llnl.gov/tutorials/parallel_comp/
另外关于并行计算的常见方法总结可以参考:
https://yq.aliyun.com/articles/68226
常用方法
并行计算的优化可以从CPU 和 GPU 两个途径进行。
在CPU 上根据指令流和数据流可以分为SIMD(单指令多数据) 和 MIMD (多指令多数据)。
- SIMD 的方法主要是使用 NEON 等 SIMD 指令(汇编或 intrinsics)进行优化(关于 SIMD 的使用简介可另行参考相关资料)。
- MIMD 是基于多核 CPU 的场景的,主要有 tbb、openmp。
GPU 上的方法主要是opencl 的使用。
方法的使用
这里记录下tbb 和 openmp 的基本使用
- tbb
官方下载地址: https://github.com/intel/tbb
可以下载源码编译,也可以下载release的版本库。
编译中使用:
g++ -g -I /path/tbb/include/ test.cpp -o test -L/path/tbb/lib/intel64/gcc4.4/ -Wl,-rpath=/path/tbb/lib/intel64/gcc4.4/ -ltbb -lrt
- openmp
openmp是一套支持跨平台共享内存方式的多线程并发的编程API。其使用很简单,目前大部分系统自带openmp 的库。
编译中使用:
编译选项中添加 -fopenmp
测试代码
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>
#include <memory>
#include "tbb/blocked_range2d.h"
#include "tbb/parallel_for.h"
using namespace tbb;
/**
 * @brief Read the monotonic clock and return it as a nanosecond count.
 *
 * The value comes from clock_gettime(CLOCK_MONOTONIC), so it is only
 * meaningful as the difference between two calls.
 *
 * @return tv_sec * 1,000,000,000 + tv_nsec of the monotonic clock.
 *         NOTE(review): the return type is long; this assumes long is wide
 *         enough (64-bit) to hold a nanosecond count - confirm on the target.
 */
inline long int getTickNs()
{
    struct timespec now = {0, 0};
    clock_gettime(CLOCK_MONOTONIC, &now);
    // tv_sec is in seconds and tv_nsec in nanoseconds: scale the seconds by
    // 1e9 so both terms use the same unit before they are added.
    return static_cast<long>(now.tv_sec) * 1000000000L + now.tv_nsec;
}
/**
 * @brief Current calendar time as reported by time(), narrowed to int.
 * @return The result of time(NULL) converted to int.
 */
inline int getTime()
{
    const time_t now = time(NULL);
    return static_cast<int>(now);
}
// Number of input rows; main() sums these rows element by element.
const uint32_t ARR_NUM = 90;
// Number of elements in each input row, and the length of the result array.
const uint32_t ARR_SIZE = 80000;
// Element type of the input rows and of the result array (used by SumBody and main).
#define DATA_TYPE int
#if 1
/**
 * Body functor for a parallel loop over a blocked_range2d<size_t>.
 *
 * For every index i in r.rows() it adds m_arr[j][i], for every j in r.cols(),
 * into m_arrSum[i]. Both pointers are borrowed; the class owns nothing.
 *
 * NOTE(review): m_arrSum[i] is updated without synchronization. If the
 * cols() dimension of the range is split between tasks, two tasks can update
 * the same m_arrSum[i] concurrently - confirm the caller keeps all of cols()
 * in a single block, or switch to an atomic/reduction-based accumulation.
 */
class SumBody2D {
private:
    int *m_arrSum;  // destination: one running sum per rows() index
    int **m_arr;    // source: indexed as m_arr[cols index][rows index]
public:
    /**
     * Accumulate the sub-block described by r into m_arrSum.
     * @param r  Block of (rows, cols) indices to process.
     */
    void operator()( const blocked_range2d<size_t>& r ) const {
        const size_t colBegin = r.cols().begin();
        const size_t colEnd = r.cols().end();
        for( size_t i = r.rows().begin(); i != r.rows().end(); ++i ) {
            // Sum locally and store once, instead of a read-modify-write of
            // m_arrSum[i] for every source element.
            int partial = 0;
            for( size_t j = colBegin; j != colEnd; ++j ) {
                partial += m_arr[j][i];
            }
            m_arrSum[i] += partial;
        }
    }

    /**
     * @param arr  Array of row pointers to read from.
     * @param sum  Array that receives the accumulated sums.
     */
    SumBody2D( int **arr, int *sum )
        // Initializers listed in declaration order, which is the order in
        // which members are actually initialized.
        : m_arrSum(sum)
        , m_arr(arr)
    {}
};
class SumBody {
private:
DATA_TYPE *m_arrSum;
DATA_TYPE **m_arr;
public:
void operator()( const blocked_range<size_t>& r ) const {
for( size_t i=r.begin(); i!=r.end(); ++i ){
for( size_t j=0; j<ARR_NUM; j+=10 ) {
m_arrSum[i] += m_arr[j][i];
m_arrSum[i] += m_arr[j+1][i];
m_arrSum[i] += m_arr[j+2][i];
m_arrSum[i] += m_arr[j+3][i];
m_arrSum[i] += m_arr[j+4][i];
m_arrSum[i] += m_arr[j+5][i];
m_arrSum[i] += m_arr[j+6][i];
m_arrSum[i] += m_arr[j+7][i];
m_arrSum[i] += m_arr[j+8][i];
m_arrSum[i] += m_arr[j+9][i];
}
}
}
SumBody( DATA_TYPE **arr, DATA_TYPE *sum)
: m_arr(arr)
, m_arrSum(sum)
{}
};
#endif
int main()
{
DATA_TYPE *arr[ARR_NUM];
arr[0] = new DATA_TYPE[ARR_NUM * ARR_SIZE];
for(int i=1; i<ARR_NUM; i++)
{
arr[i] = arr[i-1] + ARR_SIZE;
}
// memset(arr[0], 1, sizeof(int) * ARR_NUM * ARR_SIZE);
long int start = getTickNs();
for(int i=0; i<ARR_NUM; i++)
{
for(int j=0; j<ARR_SIZE; j++)
{
arr[i][j] = 1.0f;
}
}
long int end = getTickNs();
printf("set val time cost: %lldus\n", (end - start)/1000);
printf("arr[0][0]: %d\n", arr[0][ARR_SIZE-1]);
printf("arr[2][0]: %d\n", arr[ARR_NUM-1][ARR_SIZE-1]);
DATA_TYPE relt[ARR_SIZE] = {0.0f};
start = getTickNs();
for(int i=0; i<ARR_SIZE; i++)
{
for(int j=0; j<ARR_NUM; j+=10)
{
// #pragma omp atomic
// relt[i] += arr[j][i];
relt[i] += arr[j][i];
relt[i] += arr[j+1][i];
relt[i] += arr[j+2][i];
relt[i] += arr[j+3][i];
relt[i] += arr[j+4][i];
relt[i] += arr[j+5][i];
relt[i] += arr[j+6][i];
relt[i] += arr[j+7][i];
relt[i] += arr[j+8][i];
relt[i] += arr[j+9][i];
}
}
printf("get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);
// start = getTickNs();
// #pragma omp parallel
for(int i=0; i<10; i++)
{
start = getTickNs();
#pragma omp parallel for //collapse(2)
for(int i=0; i<ARR_SIZE; i++)
{
#pragma simd
for(int j=0; j<ARR_NUM; j+=10)
{
relt[i] += arr[j][i];
relt[i] += arr[j+1][i];
relt[i] += arr[j+2][i];
relt[i] += arr[j+3][i];
relt[i] += arr[j+4][i];
relt[i] += arr[j+5][i];
relt[i] += arr[j+6][i];
relt[i] += arr[j+7][i];
relt[i] += arr[j+8][i];
relt[i] += arr[j+9][i];
}
}
printf("openmp get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);
}
for(int i=0; i<10; i++)
{
start = getTickNs();
parallel_for( blocked_range<size_t>(0, ARR_SIZE), SumBody(arr, relt) );
printf("tbb get reslt, reslt[0]: %d, cost:%lldus\n", relt[0], (getTickNs() - start)/1000);
}
delete [](arr[0]);
return 0;
}
测试结果(Intel Xeon® CPU E5):
// 注:relt 在各轮测试之间没有清零,因此 reslt[0] 每轮都会在上一轮的基础上累加 90,这是预期行为而不是错误;请重点关注时间消耗。
get reslt, reslt[0]: 90, cost:37251us
openmp get reslt, reslt[0]: 180, cost:17382us
openmp get reslt, reslt[0]: 270, cost:13578us
openmp get reslt, reslt[0]: 360, cost:13842us
openmp get reslt, reslt[0]: 450, cost:13713us
openmp get reslt, reslt[0]: 540, cost:13772us
openmp get reslt, reslt[0]: 630, cost:13583us
openmp get reslt, reslt[0]: 720, cost:13747us
openmp get reslt, reslt[0]: 810, cost:13932us
openmp get reslt, reslt[0]: 900, cost:13720us
openmp get reslt, reslt[0]: 990, cost:13561us
tbb get reslt, reslt[0]: 1080, cost:15830us
tbb get reslt, reslt[0]: 1170, cost:13202us
tbb get reslt, reslt[0]: 1260, cost:13699us
tbb get reslt, reslt[0]: 1350, cost:12965us
tbb get reslt, reslt[0]: 1440, cost:13208us
tbb get reslt, reslt[0]: 1530, cost:13397us
tbb get reslt, reslt[0]: 1620, cost:13385us
tbb get reslt, reslt[0]: 1710, cost:13202us
tbb get reslt, reslt[0]: 1800, cost:13098us
tbb get reslt, reslt[0]: 1890, cost:13558us
版权声明:本文为xubuwei原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。