ECE408 Lecture 23 Alternatives to CUDA

这次回顾ECE408 Lecture 23，这次介绍了CUDA的替代品。

课程主页：

https://wiki.illinois.edu/wiki/display/ECE408

搬运视频：

https://www.youtube.com/playlist?list=PL6RdenZrxrw-UKfRL5smPfFFpeqwN3Dsz

加速计算不再是问题

GPU 供应商包括：

Nvidia；
AMD；
Intel；
Samsung；
Apple；
Qualcomm；
ARM；

CUDA只是计算加速的一种模型

计算加速的历史：

OpenGL (1992), DirectX (1995)；
GPGPU(2002)；
CUDA (2007)；
OpenCL (2008)；
OpenACC (2012), C++AMP (2013), RenderScript (2013)；
Metal (2014), SYCL (2014), Vulkan (2016),
ROCm HIP (2016)；

MPI、TBB、OpenCV等现有框架适配以提供支持。Caffe、TensorFlow、R、PyCUDA等新框架原生支持加速。

OpenCL、OpenACC、MPI

OpenCL：开放标准加速API；
OpenACC：一种“低代码”加速API；
MPI：大规模、多节点并行API；

加速API的共同特征

硬件：

轻量级核心的层次结构；
本地暂存器内存；
缺乏硬件coherence ；
慢速全局原子操作；
线程；

软件：

面向内核的加速；
设备内存与主机内存；
软件管理内存；
Grids, Blocks, Threads；
批量同步并行；

OpenCL

CPU、GPU、DSP、FPGA等的框架（不仅仅是 Nvidia GPU）；
最初由Apple在 AMD、IBM、Qualcomm、Intel和Nvidia的支持下开发。OpenCL 1.0于2008 年推出；
OpenCL 2.2 于2017年5月推出；
Apple宣布在2018年放弃OpenCL；

OpenCL内存模型

OpenCL MatMulti

注意与CUDA的相似性;
WorkGroup类似于Block；
WorkItem类似于Thread；
__local类似于__shared；

代码：

1.// Tiled and coalesced version
2.__kernel void myGEMM2(int M, int N, int K, __global float* A, __global float* B, __global float* C) {
3.
4. 		// Thread identifiers
5. 		const int row = get_local_id(0); // Local row ID (max: TS)
6. 		const int col = get_local_id(1); // Local col ID (max: TS)
7. 		const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
8. 		const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
9.
10. 	// Local memory to fit a tile of TS*TS elements of A and B
11. 	__local float Asub[TS][TS];
12. 	__local float Bsub[TS][TS];
13.
14. 	// Initialise the accumulation register
15. 	float acc = 0.0f;
16. 	// Loop over all tiles
17. 	const int numTiles = K/TS;
18. 	for (int t=0; t<numTiles; t++) {
19.
20. 		// Load one tile of A and B into local memory
21. 		const int tiledRow = TS*t + row;
22. 		const int tiledCol = TS*t + col;
23. 		Asub[col][row] = A[tiledCol*M + globalRow];
24. 		Bsub[col][row] = B[globalCol*K + tiledRow];
25.
26. 		// Synchronise to make sure the tile is loaded
27. 		barrier(CLK_LOCAL_MEM_FENCE);
28.
29. 		// Perform the computation for a single tile
30. 		for (int k=0; k<TS; k++)
31. 			acc += Asub[k][row] * Bsub[col][k];
32.
33. 		// Synchronise before loading the next tile
34. 		barrier(CLK_LOCAL_MEM_FENCE);
35. 	}
36.
37. 	// Store the final result in C
38. 	C[globalCol*M + globalRow] = acc;
39.}

OpenACC

OpenACC应用程序编程接口(API)提供了一组
- 编译器指令（pragmas）；
- 库例程；
- 环境变量；
可用于编写在加速器设备（包括GPU和CPU）上运行的数据并行的FORTRAN、C和C++程序；

OpenACC Pragmas

在C和C++ 中，#pragma指令是向编译器提供标准语言中未指定的信息的方法。

OpenACC抽象机器模型

OpenACC中的简单矩阵-矩阵乘法

1 void computeAcc(float *P, const float *M, const float *N, int Mh, int Mw, int Nw)
2 {
3
4 #pragma acc parallel loop copyin(M[0:Mh*Mw]) copyin(N[0:Nw*Mw]) copyout(P[0:Mh*Nw])
5 for (int i=0; i<Mh; i++) {
6 	#pragma acc loop
7 	for (int j=0; j<Nw; j++) {
8 		float sum = 0;
9 		for (int k=0; k<Mw; k++) {
10 			float a = M[i*Mw+k];
11 			float b = N[k*Nw+j];
12 			sum += a*b;
13 		}
14 		P[i*Nw+j] = sum;
15 	}
16 	}
17 }

一些观察

除了第4行和第6行带有#pragma的两行之外，该代码几乎与顺序版本相同
OpenACC使用编译器指令来扩展基础语言。
- 第4行的#pragma告诉编译器为第5行到第16行的“i”循环生成代码，以便循环迭代在加速器上并行执行。
- copyin子句和copyout子句指定矩阵数据应如何在主机和加速器之间传输。
- 第6行的#pragma指示编译器将内部“j”循环映射到加速器上的第二级并行性。

动机

OpenACC程序员通常可以从编写顺序版本开始，然后使用OpenACC指令注释他们的顺序程序。
- 将生成内核和数据传输的大部分细节留给OpenACC编译器。
OpenACC代码可以通过忽略编译指示由非OpenACC编译器编译。

经常遇到的问题

一些OpenACC pragmas是对OpenACC编译器的提示，它可能会或可能不会采取相应的行动；
- OpenACC的性能在很大程度上取决于编译器的质量；
- 这和CUDA和OpenCL不同；
如果忽略pragma，某些OpenACC程序可能会表现不同甚至不正确；

OpenACC设备模型

OpenACC中还有一组最小的API，用于同步、内存管理等。

OpenACC执行模型（术语：Gangs和Workers）

并行与循环构造

代码：

#pragma acc parallel loop copyin(M[0:Mh*Mw]) copyin(N[0:Nw*Mw]) copyout(P[0:Mh*Nw])
for (int i=0; i<Mh; i++) {
… 
}

等价于：

#pragma acc parallel copyin(M[0:Mh*Mw]) copyin(N[0:Nw*Mw]) copyout(P[0:Mh*Nw])
{
    #pragma acc loop
    for (int i=0; i<Mh; i++) {
    	…
    }
}

并行构造

并行构造在加速器上执行；
可以指定gangs数量和每个gang的workers数量；
程序员指令
```
#pragma acc parallel copyout(a) num_gangs(1024) num_workers(32)
{
	a = 23;
}
```
将创建1024*32个工人。 a=23将由所有1024个gang重复执行；

每个“Gang Loop”做什么？

#pragma acc parallel num_gangs(1024)
{
    for (int i=0; i<2048; i++) {
    	…
    }
}

for循环将被1024个gang冗余执行。

#pragma acc parallel num_gangs(1024)
{
	#pragma acc loop gang
    for (int i=0; i<2048; i++) {
    	…
    }
}

for循环的2048次迭代将分给1024个gang执行。

Worker循环

#pragma acc parallel num_gangs(1024) num_workers(32)
{
    #pragma acc loop gang
    for (int i=0; i<2048; i++) {
        #pragma acc loop worker
        for (int j=0; j<512; j++) {
        	foo(i,j);
        }
    }
}

将创建1024*32=32K个worker，每个执行1M/32K=32个foo()实例。

更复杂的例子

#pragma acc parallel num_gangs(32)
{
    Statement 1; Statement 2;
    #pragma acc loop gang
    for (int i=0; i<n; i++) {
    	Statement 3; Statement 4;
    }
    Statement 5; Statement 6;
    #pragma acc loop gang
    for (int i=0; i<m; i++) {
    	Statement 7; Statement 8;
    }
    Statement 9;
    if (condition)
    	Statement 10;
}

Statement 1和2被32个gang重复执行；
n-for循环迭代分配给32个gang；

抽象基于CUDA的节点

每个节点含有$N$个GPU：

MPI模型

许多进程分布在一个集群中；
每个进程计算部分输出；
进程通过消息传递相互通信（不是全局内存）；
进程可以通过消息同步；

MPI初始化，信息

用户通过在shell中执行来启动有X个进程的MPI作业；
- MPIrun –np X；
int MPI_Init(int *argc, char ***argv)；
- 初始化 MPI；
MPI_COMM_WORLD；
- 由所有分配的节点组成的MPI组；
int MPI_Comm_rank(MPI_Comm comm, int *rank)；
- 通信组中调用进程的rank；
int MPI_Comm_size(MPI_Comm comm, int *size)；
- comm组中的进程数；

向量加法：主进程

int main(int argc, char *argv[]) {
    int vector_size = 1024 * 1024 * 1024;
    int pid=-1, np=-1;
    
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    
    if(np < 3) {
        if(0 == pid) printf(“Nedded 3 or more processes.\n");
        MPI_Abort( MPI_COMM_WORLD, 1 ); return 1;
    }
    if(pid < np - 1)
    	compute_node(vector_size / (np - 1));
    else
    	data_server(vector_size);
                            
    MPI_Finalize();
    return 0;
}

MPI发送数据

int MPI_Send(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)；
- Buf：发送缓冲区的起始地址；
- Count：发送缓冲区中的元素数（非负整数）；
- Datatype：每个发送缓冲区元素的数据类型；
- Dest：目的地rank（整数）；
- Tag：消息标签（整数）；
- Comm：通讯器（句柄）

MPI接收数据

int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)；
Buf：接收缓冲区的起始地址；
Count：接收缓冲区中的最大元素数（非负整数）；
Datatype：每个接收缓冲区元素的数据类型；
Source：来源rank（整数）；
Tag：消息标签（整数）；
Comm：通讯器（句柄）；
Status：状态对象；

向量加法：服务器进程

void data_server(unsigned int vector_size) {
    int np, num_nodes = np – 1, first_node = 0, last_node = np - 2;
    unsigned int num_bytes = vector_size * sizeof(float);
    float *input_a = 0, *input_b = 0, *output = 0;
    /* Set MPI Communication Size */
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    /* Allocate input data */
    input_a = (float *)malloc(num_bytes);
    input_b = (float *)malloc(num_bytes);
    output = (float *)malloc(num_bytes);
    if(input_a == NULL || input_b == NULL || output == NULL) {
        printf("Server couldn't allocate memory\n");
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }
    /* Initialize input data */
    random_data(input_a, vector_size, 1, 10);
    random_data(input_b, vector_size, 1, 10);

    /* Send data to compute nodes */
    float *ptr_a = input_a;
    float *ptr_b = input_b;
    for(int process = 1; process < last_node; process++) {
        MPI_Send(ptr_a, vector_size / num_nodes, MPI_FLOAT,
                 process, DATA_DISTRIBUTE, MPI_COMM_WORLD);
        ptr_a += vector_size / num_nodes;

        MPI_Send(ptr_b, vector_size / num_nodes, MPI_FLOAT,
                 process, DATA_DISTRIBUTE, MPI_COMM_WORLD);
        ptr_b += vector_size / num_nodes;
    }

    /* Wait for compute to complete*/
    MPI_Barrier(MPI_COMM_WORLD);
    /* Collect output data */
    MPI_Status status;
    for(int process = 0; process < num_nodes; process++) {
        MPI_Recv(output + process * num_points / num_nodes,
                 num_points / num_comp_nodes, MPI_REAL, process,
                 DATA_COLLECT, MPI_COMM_WORLD, &status );
    }
    /* Store output data */
    store_output(output, dimx, dimy, dimz);

    /* Release resources */
    free(input);
    free(output);
}

向量加法：计算进程

void compute_node(unsigned int vector_size ) {
    int np;
    unsigned int num_bytes = vector_size * sizeof(float);
    float *input_a, *input_b, *output;
    MPI_Status status;

    MPI_Comm_size(MPI_COMM_WORLD, &np);
    int server_process = np - 1;

    /* Alloc host memory */
    input_a = (float *)malloc(num_bytes);
    input_b = (float *)malloc(num_bytes);
    output = (float *)malloc(num_bytes);

    /* Get the input data from server process */
    MPI_Recv(input_a, vector_size, MPI_FLOAT, server_process,
             DATA_DISTRIBUTE, MPI_COMM_WORLD, &status);
    MPI_Recv(input_b, vector_size, MPI_FLOAT, server_process,
             DATA_DISTRIBUTE, MPI_COMM_WORLD, &status);

    /* Compute the partial vector addition */
    for(int i = 0; i < vector_size; ++i) {
        output[i] = input_a[i] + input_b[i];
    }

    /* Or, can offload to GPU here */
    /* cudaMalloc(), cudaMemcpy(), kernel launch, etc. */
    MPI_Barrier(MPI_COMM_WORLD);

    /* Send the output */
    MPI_Send(output, vector_size, MPI_FLOAT,
    server_process, DATA_COLLECT, MPI_COMM_WORLD);
    
    /* Release memory */
    free(input_a);
    free(input_b);
    free(output);
}