Deep Learning Systems Lecture 12 GPU Acceleration
This post reviews Lecture 12 of dlsys; the topic of this lecture is GPU acceleration.
Course homepage:
Outline
- GPU programming;
- Case study: matrix multiplication on GPU;
GPU Programming
GPU vs CPU
The biggest difference between a GPU and a CPU is that a GPU has a much larger number of cores.
GPU programming model: SIMT
- Single Instruction, Multiple Threads (SIMT);
- all threads execute the same code, but can take different execution paths;
- threads are grouped into blocks;
- threads within the same block communicate through shared memory;
- blocks are grouped into a launch grid;
- a kernel executes a grid;
Note: we will use CUDA terminology in this lecture, but these concepts usually have counterparts in other GPU programming models (OpenCL, SYCL, Metal).
Example: vector addition
// CPU reference implementation: one sequential loop over all n elements.
void VecAddCPU(float* A, float *B, float* C, int n) {
  for (int i = 0; i < n; ++i) {
    C[i] = A[i] + B[i];
  }
}
// CUDA kernel: each thread computes exactly one output element.
__global__ void VecAddKernel(float* A, float *B, float* C, int n) {
  // Global index of this thread within the launch grid.
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  // Guard: the last block may contain threads past the end of the array.
  if (i < n) {
    C[i] = A[i] + B[i];
  }
}
Example: vector addition, host-side code
void VecAddCUDA(float* Acpu, float *Bcpu, float* Ccpu, int n) {
  float *dA, *dB, *dC;
  // Allocate device memory.
  cudaMalloc(&dA, n * sizeof(float));
  cudaMalloc(&dB, n * sizeof(float));
  cudaMalloc(&dC, n * sizeof(float));
  // Copy the inputs from host to device.
  cudaMemcpy(dA, Acpu, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, Bcpu, n * sizeof(float), cudaMemcpyHostToDevice);
  int threads_per_block = 512;
  // Round up so that every element is covered.
  int nblocks = (n + threads_per_block - 1) / threads_per_block;
  VecAddKernel<<<nblocks, threads_per_block>>>(dA, dB, dC, n);
  // Copy the result back and release device memory.
  cudaMemcpy(Ccpu, dC, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
}
This demonstrates host-side kernel launch and device memory allocation. Unlike this example, real applications usually keep data in GPU memory as long as possible, to avoid repeated host-device transfers.
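As a quick usage sketch (this main function is mine, not from the lecture), VecAddCUDA is called like an ordinary C function; the caller never touches device memory:

#include <cstdio>

int main() {
  const int n = 1024;
  float A[n], B[n], C[n];
  for (int i = 0; i < n; ++i) { A[i] = i; B[i] = 2 * i; }
  VecAddCUDA(A, B, C, n);        // allocates, copies, launches, copies back
  printf("C[42] = %f\n", C[42]); // expected: 42 + 84 = 126
  return 0;
}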
Examples of other GPU programming models
GPU memory hierarchy
Example: window sum
#define RADIUS 2

// Sliding-window sum: B[i] = sum of A[i .. i + 2 * RADIUS].
// A is assumed to be padded to length n + 2 * RADIUS, so the reads
// below never go out of bounds.
__global__ void WindowSumSimpleKernel(float* A, float *B, int n) {
  int out_idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (out_idx < n) {
    float sum = 0;
    for (int dx = -RADIUS; dx <= RADIUS; ++dx) {
      sum += A[dx + out_idx + RADIUS];
    }
    B[out_idx] = sum;
  }
}
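Each output element reads 2 * RADIUS + 1 consecutive inputs, so neighboring threads fetch mostly the same values from global memory. The next version stages each block's inputs in shared memory, so that each value is loaded from global memory only once per block.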
Example: window sum with shared memory
#define THREADS_PER_BLOCK 256  // assumed value; must match blockDim.x at launch

__global__ void WindowSumSharedKernel(float* A, float *B, int n) {
  // Staging buffer: the block's inputs plus a halo of 2 * RADIUS elements.
  __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS];
  int base = blockDim.x * blockIdx.x;
  int out_idx = base + threadIdx.x;
  // Each thread loads one element ...
  if (base + threadIdx.x < n) {
    temp[threadIdx.x] = A[base + threadIdx.x];
  }
  // ... and the first 2 * RADIUS threads also load the halo.
  if (threadIdx.x < 2 * RADIUS && base + THREADS_PER_BLOCK + threadIdx.x < n) {
    temp[threadIdx.x + THREADS_PER_BLOCK] = A[base + THREADS_PER_BLOCK + threadIdx.x];
  }
  // Wait until all loads into shared memory have finished.
  __syncthreads();
  if (out_idx < n) {
    float sum = 0;
    for (int dx = -RADIUS; dx <= RADIUS; ++dx) {
      sum += temp[threadIdx.x + dx + RADIUS];
    }
    B[out_idx] = sum;
  }
}
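The __syncthreads() barrier is essential: without it, a thread could read entries of temp that another thread has not written yet. Note also that the barrier must be executed by every thread in the block, so it cannot be placed inside a branch that only some threads take.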
Summary
- launch a grid of thread blocks;
- cooperatively fetch common inputs into shared memory to increase reuse;
Matrix multiplication on GPU
Thread-level: register tiling
// Pseudocode: the slice notation stands for small unrolled loops.
// Each thread computes a V x V sub-block of C in registers.
// Note the layout: A is indexed as A[k][y], i.e. this effectively computes C = A^T B.
__global__ void mm(float A[N][N], float B[N][N], float C[N][N]) {
  int ybase = blockIdx.y * blockDim.y + threadIdx.y;
  int xbase = blockIdx.x * blockDim.x + threadIdx.x;
  float c[V][V] = {0};
  float a[V], b[V];
  for (int k = 0; k < N; ++k) {
    // Load one strip of A and one strip of B into registers.
    a[:] = A[k, ybase*V : ybase*V + V];
    b[:] = B[k, xbase*V : xbase*V + V];
    // Accumulate the V x V outer product in registers.
    for (int y = 0; y < V; ++y) {
      for (int x = 0; x < V; ++x) {
        c[y][x] += a[y] * b[x];
      }
    }
  }
  C[ybase*V : ybase*V + V, xbase*V : xbase*V + V] = c[:];
}
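The slice notation above is not valid CUDA. A minimal runnable version (my sketch, assuming flattened row-major arrays, N divisible by blockDim.y * V, and the A[k][y] layout above, i.e. C = A^T B) could look like:

#define N 1024
#define V 4

// Register-tiled matmul on flattened row-major arrays: C = A^T * B.
__global__ void mm_register_tiled(const float* A, const float* B, float* C) {
  int ybase = blockIdx.y * blockDim.y + threadIdx.y;
  int xbase = blockIdx.x * blockDim.x + threadIdx.x;
  float c[V][V] = {0};
  float a[V], b[V];
  for (int k = 0; k < N; ++k) {
    // Slice loads expanded into explicit loops.
    for (int y = 0; y < V; ++y) a[y] = A[k * N + ybase * V + y];
    for (int x = 0; x < V; ++x) b[x] = B[k * N + xbase * V + x];
    for (int y = 0; y < V; ++y)
      for (int x = 0; x < V; ++x)
        c[y][x] += a[y] * b[x];
  }
  // Write the V x V register tile back to C.
  for (int y = 0; y < V; ++y)
    for (int x = 0; x < V; ++x)
      C[(ybase * V + y) * N + xbase * V + x] = c[y][x];
}

Launch it so that gridDim.y * blockDim.y * V == N, e.g. blockDim = (16, 16) and gridDim = (N / 64, N / 64) for V = 4.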
Block-level: shared memory tiling
// Pseudocode: each block computes an L x L tile of C; S is the chunk of the
// reduction dimension staged in shared memory per outer step.
__global__ void mm(float A[N][N], float B[N][N], float C[N][N]) {
  __shared__ float sA[S][L], sB[S][L];
  float c[V][V] = {0};
  float a[V], b[V];
  int yblock = blockIdx.y;
  int xblock = blockIdx.x;
  for (int ko = 0; ko < N; ko += S) {
    __syncthreads();
    // Needs to be implemented by thread cooperative fetching (expanded below).
    sA[:, :] = A[ko : ko + S, yblock * L : yblock * L + L];
    sB[:, :] = B[ko : ko + S, xblock * L : xblock * L + L];
    __syncthreads();
    for (int ki = 0; ki < S; ++ki) {
      // Register tiling within the shared-memory tile, as in the previous kernel.
      a[:] = sA[ki, threadIdx.y * V : threadIdx.y * V + V];
      b[:] = sB[ki, threadIdx.x * V : threadIdx.x * V + V];
      for (int y = 0; y < V; ++y) {
        for (int x = 0; x < V; ++x) {
          c[y][x] += a[y] * b[x];
        }
      }
    }
  }
  int ybase = blockIdx.y * blockDim.y + threadIdx.y;
  int xbase = blockIdx.x * blockDim.x + threadIdx.x;
  C[ybase*V : ybase*V + V, xbase*V : xbase*V + V] = c[:];
}
Analysis:
- global->shared copy: $2N^3 / L$;
- shared->register: $2N^3 / V$;
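A quick derivation of these counts (my own bookkeeping, using the tile sizes S, L, V from the kernel above): the grid has $(N/L)^2$ blocks, each block runs $N/S$ outer steps loading $2SL$ elements per step, and each block has $(L/V)^2$ threads, each loading $2V$ register values per reduction step:

$$
\left(\frac{N}{L}\right)^2 \cdot \frac{N}{S} \cdot 2SL = \frac{2N^3}{L},
\qquad
\left(\frac{N}{L}\right)^2 \cdot \left(\frac{L}{V}\right)^2 \cdot N \cdot 2V = \frac{2N^3}{V}.
$$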
Expanding cooperative fetching
The line:
sA[:, :] = A[ko : ko + S, yblock * L : yblock * L + L];
can be expanded into:
int nthreads = blockDim.y * blockDim.x;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
// All threads in the block split the S x L tile evenly among themselves
// (this assumes nthreads divides S * L).
for (int j = 0; j < L * S / nthreads; ++j) {
  int y = (j * nthreads + tid) / L;
  int x = (j * nthreads + tid) % L;
  sA[y][x] = A[ko + y][yblock * L + x];
}
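Note that consecutive values of tid map to consecutive x, i.e. to adjacent addresses within one row of A, so the global loads issued by a warp are coalesced; this connects to the first bullet in the list below.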
More GPU optimization techniques
- coalesced reads from global memory;
- avoiding shared memory bank conflicts (a sketch of the standard padding trick follows this list);
- software pipelining;
- warp-level optimizations;
- Tensor Cores;
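As one example from this list, a common way to avoid shared memory bank conflicts is to pad the inner dimension of a shared tile by one element. A minimal sketch (the transpose kernel and tile size 32 are my choices for illustration; launch with blockDim = (32, 32)):

#define TILE 32

__global__ void transpose(const float* in, float* out, int n) {
  // The "+ 1" padding shifts each row by one bank; without it, reading a
  // column of the tile would send all 32 threads of a warp to the same bank.
  __shared__ float tile[TILE][TILE + 1];
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  // Coalesced load of one input tile into shared memory.
  if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[y * n + x];
  __syncthreads();
  // Coalesced store of the transposed tile; reads go down a tile column.
  int tx = blockIdx.y * TILE + threadIdx.x;
  int ty = blockIdx.x * TILE + threadIdx.y;
  if (tx < n && ty < n) out[ty * n + tx] = tile[threadIdx.x][threadIdx.y];
}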