Sample: simpleMPI
Minimum spec: SM 3.0
Simple example demonstrating how to use MPI in combination with CUDA.
Key concepts:
CUDA Systems Integration
MPI
Multithreading
simpleMPI.h
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Simple example demonstrating how to use MPI with CUDA
*
* Generate some random numbers on one node.
* Dispatch them to all nodes.
* Compute their square root on each node's GPU.
* Compute the average of the results using MPI.
*
* simpleMPI.h: common header file
*/
// Forward declarations
extern "C" {
void initData(float *data, int dataSize);
void computeGPU(float *hostData, int blockSize, int gridSize);
float sum(float *data, int size);
void my_abort(int err);
}
simpleMPI.cu
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Simple example demonstrating how to use MPI with CUDA
*
* Generate some random numbers on one node.
* Dispatch them to all nodes.
* Compute their square root on each node's GPU.
* Compute the average of the results using MPI.
*
* simpleMPI.cu: GPU part, compiled with nvcc
*/
#include <iostream>
using std::cerr;
using std::endl;
#include "simpleMPI.h"
// Error handling macro
#define CUDA_CHECK(call) \
    if((call) != cudaSuccess) { \
        cudaError_t err = cudaGetLastError(); \
        cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
        my_abort(err); }
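// Note: CUDA_CHECK reports the code returned by cudaGetLastError() (which also
// clears the sticky error state) rather than the return value of the call itself,
// and it expands to a bare if-statement, so it is safest used as a standalone
// statement.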
// Device code
// Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = sqrt(input[tid]);
}
// Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++)
    {
        data[i] = (float)rand() / RAND_MAX;
    }
}
// CUDA computation on each node
// No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;

    // Allocate data on GPU memory
    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

    // Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

    // Run kernel
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // Copy data back to CPU memory
    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Free GPU memory
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}
float sum(float *data, int size)
{
    float accum = 0.f;

    for (int i = 0; i < size; i++)
    {
        accum += data[i];
    }

    return accum;
}
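The kernel launch inside computeGPU() is not checked explicitly; the cudaMemcpy() that follows it synchronizes and will normally surface failures, but a launch-configuration error is easier to diagnose with a dedicated check. Below is a minimal sketch of such a check, reusing the sample's CUDA_CHECK and my_abort() helpers; the launchAndCheck() helper is illustrative and not part of the sample.

// Hypothetical helper for simpleMPI.cu: launch the kernel and verify both the
// launch itself and its execution before returning.
static void launchAndCheck(float *deviceInputData, float *deviceOutputData,
                           int blockSize, int gridSize)
{
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // cudaGetLastError() reports launch failures such as an invalid configuration
    cudaError_t launchErr = cudaGetLastError();

    if (launchErr != cudaSuccess)
    {
        cerr << "Kernel launch failed: " << cudaGetErrorString(launchErr) << endl;
        my_abort(launchErr);
    }

    // cudaDeviceSynchronize() surfaces errors raised while the kernel runs
    CUDA_CHECK(cudaDeviceSynchronize());
}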
simpleMPI.cpp
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Simple example demonstrating how to use MPI with CUDA
*
* Generate some random numbers on one node.
* Dispatch them to all nodes.
* Compute their square root on each node's GPU.
* Compute the average of the results using MPI.
*
* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
* on Windows, please download the Microsoft HPC Pack SDK 2008
*/
// MPI include
#include <mpi.h>
// System includes
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
// User include
#include "simpleMPI.h"
// Error handling macros
#define MPI_CHECK(call) \
    if((call) != MPI_SUCCESS) { \
        cerr << "MPI error calling \""#call"\"\n"; \
        my_abort(-1); }
// Host code
// No CUDA here, only MPI
int main(int argc, char *argv[])
{
    // Dimensions of the dataset
    int blockSize = 256;
    int gridSize = 10000;
    int dataSizePerNode = gridSize * blockSize;

    // Initialize MPI state
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get our MPI node number and node count
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // Generate some random numbers on the root node (node 0)
    int dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot = NULL;

    if (commRank == 0)  // Are we the root node?
    {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }
    // Allocate a buffer on each node
    float *dataNode = new float[dataSizePerNode];

    // Dispatch a portion of the input data to each node
    MPI_CHECK(MPI_Scatter(dataRoot,
                          dataSizePerNode,
                          MPI_FLOAT,
                          dataNode,
                          dataSizePerNode,
                          MPI_FLOAT,
                          0,
                          MPI_COMM_WORLD));
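    // Note: MPI_Scatter reads its send buffer (dataRoot) only on the root rank,
    // so the NULL value passed on the other ranks is valid.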
    if (commRank == 0)
    {
        // No need for root data any more
        delete [] dataRoot;
    }

    // On each node, run computation on GPU
    computeGPU(dataNode, blockSize, gridSize);

    // Reduction to the root node, computing the sum of output elements
    float sumNode = sum(dataNode, dataSizePerNode);
    float sumRoot;
    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    if (commRank == 0)
    {
        float average = sumRoot / dataSizeTotal;
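        // For inputs uniformly distributed in [0, 1], the expected value of
        // sqrt(x) is 2/3, so the printed average should be close to 0.667.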
cout << "Average of square roots is: " << average << endl;
}
// Cleanup
delete [] dataNode;
MPI_CHECK(MPI_Finalize());
if (commRank == 0)
{
cout << "PASSED\n";
}
return 0;
}
// Shut down MPI cleanly if something goes wrong
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
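The sample assumes each MPI process sees a single GPU. On nodes with several GPUs and several ranks, each rank would typically bind itself to one device before making any other CUDA call. Below is a minimal sketch of that pattern, assuming a simple rank-modulo-device-count mapping; the selectDevice() helper is hypothetical, would live in simpleMPI.cu, be declared in simpleMPI.h, and be called from main() right after MPI_Comm_rank().

// Hypothetical helper (not part of the sample): bind the calling rank to a GPU.
void selectDevice(int commRank)
{
    int deviceCount = 0;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0)
    {
        cerr << "No CUDA devices found" << endl;
        my_abort(-1);
    }

    // Simple round-robin mapping of global rank to local device; a production
    // code would usually map by the rank local to the node, e.g. obtained via
    // MPI_Comm_split_type(..., MPI_COMM_TYPE_SHARED, ...).
    CUDA_CHECK(cudaSetDevice(commRank % deviceCount));
}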