Deep Learning Systems Lecture 11 Hardware Acceleration
这里回顾dlsys第十一讲,本讲内容是硬件加速。
课程主页:
大纲
- 一般加速技术;
- 案例研究:矩阵乘法;
一般加速技术
机器学习框架中的层
向量化
相加两个长度为256的数组:
void vecadd(float* A, float *B, float* C) {
for (int i = 0; i < 64; ++i) {
float4 a = load_float4(A + i*4);
float4 b = load_float4(B + i*4);
float4 c = add_float4(a, b);
store_float4(C + i* 4, c);
}
}
附加要求:内存(A、B、C)需要对齐到128位。
数据布局和步幅
- 问题:如何在内存中存储一个矩阵;
- Row major:
A[i, j] => Adata[i * A.shape[1] + j]
- Column major:
A[i, j] => Adata[j * A.shape[0] + i]
- Strides format:
A[i, j] => Adata[i * A.strides[0] + j * A.strides[1]]
;
关于strides的讨论
优点:可以零拷贝方式进行转换/切片:
- 切片:更改begin offset和形状;
- Transpose:交换strides;
- 广播:插入一个等于0的stride;
缺点:内存访问变得不连续:
- 使向量化更难;
- 许多线性代数运算可能需要先压缩数组;
并行
// Same SIMD vector add as above, parallelized across threads with OpenMP:
// each iteration is independent, so the 64 float4 chunks can be split
// among threads.
void vecadd(float* A, float *B, float* C) {
#pragma omp parallel for
  for (int i = 0; i < 64; ++i) {
    float4 a = load_float4(A + i*4);
    float4 b = load_float4(B + i*4);
    float4 c = add_float4(a, b);
    // Fix: store to C + i*4 (the transcription dropped "+ i", writing
    // "C * 4"), matching the scalar-offset pattern of the loads above.
    store_float4(C + i*4, c);
  }
}
案例研究:矩阵乘法
Vanilla矩阵乘法
计算$C=A B^T$,朴素的实现时间复杂度为$O(n^3)$,代码:
// Naive O(n^3) matrix multiply computing C = A * B^T:
// B is indexed as B[j][k], i.e. row j of B, hence the transpose.
float A[n][n], B[n][n], C[n][n];
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
C[i][j] = 0;
for (int k = 0; k < n; ++k) {
// Inner product of row i of A with row j of B.
C[i][j] += A[i][k] * B[j][k];
}
}
}
现代CPU上的内存层次结构
考虑结构的复杂度分析
// The same O(n^3) matmul, annotated with memory placement for cost
// analysis. NOTE: "dram" and "register" here are lecture annotations
// marking where each value lives, not standard C++ storage classes.
dram float A[n][n], B[n][n], C[n][n];
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
// Accumulate the dot product in one register; C is written once per (i,j).
register float c = 0;
for (int k = 0; k < n; ++k) {
// Every A[i][k] and B[j][k] is loaded from DRAM into a register,
// giving 2 * n^3 DRAM loads in total (see analysis below).
register float a = A[i][k];
register float b = B[j][k];
c += a * b;
}
C[i][j] = c;
}
}
分析:
- A dram->register time cost: $n^3$;
- B dram->register time cost: $n^3$;
- A register memory cost: 1;
- B register memory cost: 1;
- C register memory cost: 1;
小结:
- Load cost: $2\times \mathrm{dramspeed} \times n^3$;
- Register cost:$3$;
Tiled矩阵乘法
// Register-tiled matmul (pseudocode): A, B, C are stored pre-tiled into
// v1 x v3, v2 x v3 and v1 x v2 sub-blocks so each tile is loaded as a unit.
dram float A[n/v1][n/v3][v1][v3];
dram float B[n/v2][n/v3][v2][v3];
dram float C[n/v1][n/v2][v1][v2];
for (int i = 0; i < n/v1; ++i) {
for (int j = 0; j < n/v2; ++j) {
// One v1 x v2 output tile accumulated entirely in registers.
register float c[v1][v2] = 0;
for (int k = 0; k < n/v3; ++k) {
// Tile loads: each a-tile load is amortized over the j loop and each
// b-tile load over the i loop (the reuse analyzed below).
register float a[v1][v3] = A[i][k];
register float b[v2][v3] = B[j][k];
// Pseudocode: small-matrix product of the two tiles, a * b^T.
c += dot(a, b.T);
}
C[i][j] = c;
}
}
分析:
- A dram->register time cost: $n/v_1 \times n/v_2 \times n/ v_3 \times v_1\times v_3=n^3/ v_2$;
- B dram->register time cost: $n/v_1 \times n/v_2 \times n/ v_3 \times v_2\times v_3=n^3/v_1$;
- A register memory cost: $v_1\times v_3$;
- B register memory cost: $v_2\times v_3$;
- C register memory cost: $v_1\times v_2$;
小结:
- Load cost: $\mathrm{dramspeed} \times (n^3/ v_2 + n^3/ v_1)$;
- Register cost:$v_1\times v_3+ v_2\times v_3 + v_1\times v_2$;
感知cache line的tiling
// Cache-line-aware tiling (pseudocode): keep a b1 x n panel of A and a
// b2 x n panel of B resident in L1 while computing one C tile.
// "l1cache" is a lecture annotation for L1-resident data, not C++.
dram float A[n/b1][b1][n];
dram float B[n/b2][b2][n];
dram float C[n/b1][n/b2][b1][b2];
for (int i = 0; i < n/b1; ++i) {
// Panel of A loaded once per i: total DRAM->L1 traffic n/b1 * b1 * n = n^2.
l1cache float a[b1][n] = A[i];
for (int j = 0; j < n/b2; ++j) {
// Panel of B reloaded for every i: total traffic n^3 / b1.
l1cache b[b2][n] = B[j];
C[i][j] = dot(a, b.T);
}
}
分析:
- A dram->l1 time cost: $n/b_1\times b_1 \times n = n^2$;
- B dram->l1 time cost: $n/b_1\times n/ b_2 \times b_2\times n =n^3/ b_1$;
约束:
b1 * n + b2 * n < l1 cache size
- 要在点乘上使用分块,需要:
b1 % v1 == 0
b2 % v2 == 0
;
小结
// Putting it together (pseudocode): L1 tiling (b1, b2) on the outer loops
// combined with register tiling (v1, v2) on the inner loops.
// "dram" / "l1cache" / "register" and the slice syntax a[x][k][:] are
// lecture annotations, not standard C++.
dram float A[n/b1][b1/v1][n][v1];
dram float B[n/b2][b2/v2][n][v2];
for (int i = 0; i < n/b1; ++i) {
  l1cache float a[b1/v1][n][v1] = A[i];
  for (int j = 0; j < n/b2; ++j) {
    l1cache b[b2/v2][n][v2] = B[j];
    for (int x = 0; x < b1/v1; ++x) {
      for (int y = 0; y < b2/v2; ++y) {
        // One v1 x v2 output tile accumulated in registers.
        register float c[v1][v2] = 0;
        for (int k = 0; k < n; ++k) {
          register float ar[v1] = a[x][k][:];
          register float br[v2] = b[y][k][:];
          // Fix: accumulate into the register tile c declared above (the
          // transcription wrote "C +=", leaving c dead). NOTE(review):
          // the final store of c back into the output C is also omitted
          // here, as on the original slide.
          c += dot(ar, br.T);
        }
      }
    }
  }
}
分析:
- load cost: $\mathrm{l1speed}\times (n^3/ v_2 + n^3 / v_1) + \mathrm{dramspeed} \times (n^2 + n^3 / b_1)$;
关键思想:内存加载重用
// The register-tiled matmul again, this time to highlight load reuse:
// a loaded a-tile is reused across the j loop, a b-tile across the i loop.
dram float A[n/v1][n/v3][v1][v3];
dram float B[n/v2][n/v3][v2][v3];
dram float C[n/v1][n/v2][v1][v2];
for (int i = 0; i < n/v1; ++i) {
for (int j = 0; j < n/v2; ++j) {
register float c[v1][v2] = 0;
for (int k = 0; k < n/v3; ++k) {
// a does not depend on j -> reused v2 times; b does not depend on i
// -> reused v1 times (see analysis below).
register float a[v1][v3] = A[i][k];
register float b[v2][v3] = B[j][k];
c += dot(a, b.T);
}
C[i][j] = c;
}
}
分析:
- a重用$v_2$次;
- b重用$v_1$次;
- A dram->register time cost: $n/v_1 \times n/v_2 \times n/ v_3 \times v_1\times v_3=n^3/ v_2$;
- B dram->register time cost: $n/v_1 \times n/v_2 \times n/ v_3 \times v_2\times v_3=n^3/v_1$;
常见的重用模式
// Generic reuse pattern (pseudocode): the access A[i][k] is independent
// of j, so tiling the j dimension by a factor v lets each loaded A value
// be reused v times.
float A[n][n];
float B[n][n];
float C[n][n];
C[i][j] = sum(A[i][k] * B[j][k], axis=k)
$A$的访问独立于$j$,对$j$维度tile $v$次,可以使$A$重用$v$次。
讨论:卷积中可能的重用模式
// Convolution access pattern (pseudocode) for the reuse discussion:
// the Input access is independent of co, and the Weight access is
// independent of b, y, x — each independence is a reuse opportunity.
float Input[n][ci][h][w];
float Weight[co][ci][K][K];
float Output[n][co][h][w];
Output[b][co][y][x] = sum(Input[b][k][y+ry][x+rx] * Weight[co][k][ry][rx], axis=[k, ry, rx])
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Doraemonzzz!
评论
ValineLivere