CMU 15-213 Lab4 Cache Lab

课程主页:http://www.cs.cmu.edu/afs/cs/academic/class/15213-f15/www/schedule.html

课程资料:https://github.com/EugeneLiu/translationCSAPP

课程视频:https://www.bilibili.com/video/av31289365/

这一部分回顾CSAPP的Cache Lab。

参考资料:

https://www.jianshu.com/p/e68dd8305e9c

https://blog.codedragon.tech/2017/09/25/%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%B3%BB%E7%BB%9FCacheLab-PartB%E5%AE%9E%E9%AA%8C%E6%8A%A5%E5%91%8A/

https://wdxmzy.com/csapp/thick-csapp-lab-4/2016/04/16/

https://www.jianshu.com/p/582367289391

备注,该实验需要安装valgrind

1
sudo apt install valgrind

Part A:Writing a Cache Simulator

说明

首先忽略I,因为文档中有如下说明:

  • For this lab, we are interested only in data cache performance, so your simulator should ignore all
    instruction cache accesses (lines starting with “I”).

其次忽略size参数,因为文档中有如下说明:

  • For this this lab, you should assume that memory accesses are aligned properly, such that a single
    memory access never crosses block boundaries. By making this assumption, you can ignore the
    request sizes in the valgrind traces.

算法

该程序最重要的是L,S,M操作,对应如下三个函数

  • Load
  • Store
  • Modify

Store函数和Load函数的操作相同,注意到文章中有如下说明:

  • The data modify operation (M) is treated as a load followed by a store to the same address

所以Modify函数为Load函数加上Store函数,所以我们只需要设计Load函数,伪代码如下

  • 第一轮迭代,计算最大的LRU,如果$(\text{cache[index_][i].valid_bit} == 1) \&\& (\text{cache[index_][i].tag == tag})$则hit,同时找到$\text{cache[index_][i].valid_bit} == 0$的位置
  • 如果没有hit,输出miss
  • 如果存在$\text{cache[index_][i].valid_bit} == 0$的位置,则直接保存结果
  • 否则再次遍历,找到最大LRU对应的位置,保存结果
  • 输出evict

代码

备注:不要自己实现printSummary函数,该函数仅用于自己调试。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#include "cachelab.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>

void parse_parameter(int argc, char *argv[]);
void read_file(char *tracefile);
void parse_address(unsigned address);
void get_cache();
void Load();
void Store();
void Modify();
void print_help();

typedef struct cache_line{
unsigned valid_bit;
unsigned tag;
int LRU;
}cache_line;

//参数
int h, v, s, E, b, S, B;
unsigned tag, index_, offset;
char *tracefile;
//计数
int hits = 0;
int misses = 0;
int evictions = 0;
//参数的形式
char mode[4] = {'s', 'E', 'b', 't'};
//判断参数是否正确
int flag = 1;
//缓存
cache_line **cache;

int main(int argc, char *argv[]){
//解析参数
parse_parameter(argc, argv);

//如果参数正确才进行后续操作
if (flag){
//输出help信息
if (h == 1){
print_help();
}else{
//构造cache
get_cache();
//读取文件
read_file(tracefile);
//文件可能不存在,所以这里再加以判断
if (flag){
//输出
printSummary(hits, misses, evictions);
}

}
}else{
printf("%s: Missing required command line argument\n", argv[0]);
print_help();
}

return 0;
}

//输出help信息
void print_help(){
printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n");
printf("Options:\n");
printf(" -h Print this help message.\n");
printf(" -v Optional verbose flag.\n");
printf(" -s <num> Number of set index bits.\n");
printf(" -E <num> Number of lines per set.\n");
printf(" -b <num> Number of block offset bits.\n");
printf(" -t <file> Trace file.\n");
printf("Examples:\n");
printf(" linux> ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n");
printf(" linux> ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}

//参考https://blog.csdn.net/fengxinlinux/article/details/51541003
//申请缓存空间并初始化
void get_cache(){
cache = (cache_line **)malloc(S * sizeof(cache_line*));
for (int i = 0; i < S; i++){
cache[i] = (cache_line *)malloc(E * sizeof(cache_line));
for (int j = 0; j < E; j++){
//初始化
cache[i][j].valid_bit = 0;
cache[i][j].tag = 0;
cache[i][j].LRU = 0;
}
}
}

//解析参数
void parse_parameter(int argc, char *argv[]){
if ((argc != 9) && (argc != 10)){
printf("wrong argument\n");
flag = 0;
return;
}
int opt, n;
while ((opt = getopt(argc, argv, "s:E:b:t:vh")) != -1){
switch (opt) {
case 'h':
h = 1;
v = 0;
break;
case 'v':
h = 0;
v = 1;
break;
case 's':
s = atoi(optarg);
break;
case 'E':
E = atoi(optarg);
break;
case 'b':
b = atoi(optarg);
break;
case 't':
n = strlen(optarg);
tracefile = (char *)malloc(n + 1);
strcpy(tracefile, optarg);
break;
default:
printf("wrong argument\n");
flag = 0;
break;
}
}

if (flag){
S = 1 << s;
B = 1 << b;
}
}

//解析地址
void parse_address(unsigned address){
offset = address % (1 << b);
address >>= b;
index_ = address % (1 << s);
address >>= s;
tag = address;
}

//Load
void Load(){
//记录是否hit
int f = 0;
//最大lru
int max_lru = 0;
//可以存储的位置
int place = -1;
//匹配tag
for (int i = 0; i < E; i++){
//更新LRU
cache[index_][i].LRU++;
//更新max_lru
max_lru = (max_lru > cache[index_][i].LRU) ? max_lru : cache[index_][i].LRU;
//判断是否hit
if ((cache[index_][i].valid_bit == 1) && (cache[index_][i].tag == tag)){
f = 1;
cache[index_][i].LRU = 0;
}
//判断是否存在valid_bit=0的位置
if (cache[index_][i].valid_bit == 0){
place = i;
}
}
//如果没有hit则保存信息
if (f == 0){
misses++;
if (v){
printf(" miss");
}
//判断是否需要evict
if (place != -1){
cache[index_][place].valid_bit = 1;
cache[index_][place].tag = tag;
cache[index_][place].LRU = 0;
}else{
evictions++;
if (v){
printf(" eviction");
}
//遍历,找到LRU最大的cache
for (int i = 0; i < E; i++){
if (cache[index_][i].LRU == max_lru){
cache[index_][i].tag = tag;
cache[index_][i].LRU = 0;
break;
}
}
}
}else{
hits++;
if (v){
printf(" hit");
}
}
}

//Store
void Store(){
Load();
}

//Modify
void Modify(){
Load();
Store();
}

//读取文件
void read_file(char *tracefile){
FILE *pFile;
pFile = fopen(tracefile, "r");
//判断文件是否存在
if (pFile == NULL){
printf("%s: No such file or directory\n", tracefile);
flag = 0;
return;
}

char identifier;
unsigned address;
int size;

//读取
while (fscanf(pFile, " %c %x,%d", &identifier, &address, &size) > 0){
printf(" %c %x,%d\n", identifier, address, size);
if (identifier == 'I'){
continue;
}
parse_address(address);
printf("%x %x %x\n", tag, index_, offset);
//输出结果
if (v){
printf(" %c %x,%d", identifier, address, size);
}
if (identifier == 'L'){
Load();
}else if(identifier == 'S'){
Store();
}else{
Modify();
}
if (v){
printf("\n");
}
}

fclose(pFile);
}

//输出结果
/*
void printSummary(int hits, int misses, int evictions){
printf("hits:%d misses:%d evictions:%d\n", hits, misses, evictions);
}
*/

Part B: Optimizing Matrix Transpose

回顾缓存的结构

此处$s=5, E=1, b=5$,所以一共$32$组缓存块,每组$1$个缓存块,每个缓存块有$32$字节,因为本题考虑的变量为int($4$字节),所以一个缓存块存储$8$个int。

$32\times32$

每行有$32$个int,所以每行需要$4$个缓存块,从而矩阵中缓存index的分布(下图来自https://www.jianshu.com/p/e68dd8305e9c#fn1):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
 	+--+--+--+--+
0 | 0| 1| 2| 3|
+--+--+--+--+
1 | 4| 5| 6| 7|
+--+--+--+--+
2 | 8| 9|10|11|
+--+--+--+--+
3 |12|13|14|15|
+--+--+--+--+
4 |16|17|18|19|
+--+--+--+--+
5 |20|21|22|23|
+--+--+--+--+
6 |24|25|26|27|
+--+--+--+--+
7 |28|29|30|31|
+--+--+--+--+
8 | 0| 1| 2| 3|
+--+--+--+--+
每一个小格子表示一个缓存块,格子中的数字是缓存块的index。
可以看到第0行和第8行缓存块的index是重复的。

所以按$8\times 8$分块,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
void transpose1_v0(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
for (i = 0; i < 8; i ++){
for (j = 0; j < 8; j++){
B[jj + j][ii + i] = A[ii + i][jj + j];
}
}
}
}
}

运行:

1
2
make
./test-trans -M 32 -N 32

结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=343

TEST_TRANS_RESULTS=1:343

超过限制$300$,下面进行分析(依然参考https://www.jianshu.com/p/e68dd8305e9c#fn1):

每个$8\times 8$分块会产生$8$次miss(每行$1$次),一共有$32\times 32/(8\times 8)=16$个分块,加上两个矩阵,所以会产生$16\times 8\times 2= 256$次miss,和实际结果相差较多。

按照网上大佬的提示,在tracegen.c的如下代码处

1
2
/* Fill A with data */
initMatrix(M,N, A, B);

增加打印A,B位置的代码

1
printf("%p %p\n", A, B);

编译后运行

1
2
./tracegen -M 32 -N 32
0x5639516b0080 0x5639516f0080

接着回顾高速缓存读取的方法,索引组判断属于哪个组,块偏移确定了$B$个字节的数据块中的字偏移,标记位确定标记:

利用如下代码输出缓存信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <stdio.h>

//解析地址
void parse_address(unsigned address){
int b = 5;
int s = 5;

unsigned offset = address % (1 << b);
address >>= b;
unsigned index_ = address % (1 << s);
address >>= s;
unsigned tag = address;

printf("%x %x %x\n", tag, index_, offset);
}

int main(){
unsigned long address = 0x55bd9d978080;
parse_address(address);
address = 0x55bd9d9b8080;
parse_address(address);

return 0;
}

编译并且运行得到

1
2
3
4
gcc -o cache_address cache_address.c
./cache_address
2765e0 4 0
2766e0 4 0

说明A,B的同一行在缓存的同一组,这一步会产生问题,来看一个具体例子:

1
2
3
4
5
tmp = A[0][0] 			//cache:A[0]
B[0][0] = tmp //cache:B[0](A[0]和B[0]在同一组)
tmp = A[0][1] //cache:A[0]
B[1][0] = tmp //cache:A[0],B[1]
......

解决方法:A的每一行先存入本地变量,然后再复制到B,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
void transpose1(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
for (i = 0; i < 8; i ++){
//使用本地变量,本地变量保存再寄存器上
int a[8];
for (j = 0; j < 8; j++){
a[j] = A[ii + i][jj + j];
}
for (j = 0; j < 8; j++){
B[jj + j][ii + i] = a[j];
}
}
}
}
}

运行:

1
2
make
./test-trans -M 32 -N 32

结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:289, evictions:257

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=289

TEST_TRANS_RESULTS=1:289

$64\times 64$

参考:

https://blog.codedragon.tech/2017/09/25/%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%B3%BB%E7%BB%9FCacheLab-PartB%E5%AE%9E%E9%AA%8C%E6%8A%A5%E5%91%8A/

思路是将$8\times 8$的分块拆成$4$个$4\times 4$的分块,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
void transpose2(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
int a0, a1, a2, a3, a4, a5, a6, a7;
//将8*8矩阵分为4块,分别为A11,A12,A21,A22
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
//B11=A11.T,B12=A12.T
for (i = 0; i < 4; i++){
//本地变量
a0 = A[ii + i][jj + 0];
a1 = A[ii + i][jj + 1];
a2 = A[ii + i][jj + 2];
a3 = A[ii + i][jj + 3];
a4 = A[ii + i][jj + 4];
a5 = A[ii + i][jj + 5];
a6 = A[ii + i][jj + 6];
a7 = A[ii + i][jj + 7];
//保存前4个元素,按列
B[jj + 0][ii + i] = a0;
B[jj + 1][ii + i] = a1;
B[jj + 2][ii + i] = a2;
B[jj + 3][ii + i] = a3;
//保存后4个元素,按列
B[jj + 0][ii + i + 4] = a4;
B[jj + 1][ii + i + 4] = a5;
B[jj + 2][ii + i + 4] = a6;
B[jj + 3][ii + i + 4] = a7;
}

for (i = 4; i < 8; i++){
//将B12每行元素保存在本地变量
a0 = B[jj + i - 4][ii + 0 + 4];
a1 = B[jj + i - 4][ii + 1 + 4];
a2 = B[jj + i - 4][ii + 2 + 4];
a3 = B[jj + i - 4][ii + 3 + 4];
//B12=A21.T
for (j = 0; j < 4; j++){
B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
}
//B21=B12=A12.T,按行
B[jj + i][ii + 0] = a0;
B[jj + i][ii + 1] = a1;
B[jj + i][ii + 2] = a2;
B[jj + i][ii + 3] = a3;
//B22=A22.T
for (j = 4; j < 8; j++){
B[jj + i][ii + j] = A[ii + j][jj + i];
}
}
}
}
}

运行:

1
2
make
./test-trans -M 64 -N 64

结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:9026, misses:1219, evictions:1187

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1219

TEST_TRANS_RESULTS=1:1219

作为对比,这里提供另一个函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
void transpose4(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
int a0, a1, a2, a3, a4, a5, a6, a7;
//将8*8矩阵分为4块,分别为A11,A12,A21,A22
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
//B11=A11.T,B12=A12
for (i = 0; i < 4; i++){
//本地变量
a0 = A[ii + i][jj + 0];
a1 = A[ii + i][jj + 1];
a2 = A[ii + i][jj + 2];
a3 = A[ii + i][jj + 3];
a4 = A[ii + i][jj + 4];
a5 = A[ii + i][jj + 5];
a6 = A[ii + i][jj + 6];
a7 = A[ii + i][jj + 7];
//保存前4个元素,按列
B[jj + 0][ii + i] = a0;
B[jj + 1][ii + i] = a1;
B[jj + 2][ii + i] = a2;
B[jj + 3][ii + i] = a3;
//保存后4个元素,按行
B[jj + i][ii + 4] = a4;
B[jj + i][ii + 5] = a5;
B[jj + i][ii + 6] = a6;
B[jj + i][ii + 7] = a7;
}

for (i = 4; i < 8; i++){
//将B12每行元素保存在本地变量
a0 = B[jj + i - 4][ii + 0 + 4];
a1 = B[jj + i - 4][ii + 1 + 4];
a2 = B[jj + i - 4][ii + 2 + 4];
a3 = B[jj + i - 4][ii + 3 + 4];

//B12=A21.T
for (j = 0; j < 4; j++){
B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
}

//B21=B12.T=A12.T,按列(这部分会造成miss)
B[jj + 4][ii + i - 4] = a0;
B[jj + 5][ii + i - 4] = a1;
B[jj + 6][ii + i - 4] = a2;
B[jj + 7][ii + i - 4] = a3;

//B22=A22.T
for (j = 4; j < 8; j++){
B[jj + j][ii + i] = A[ii + i][jj + j];
}
}
}
}
}

运行:

1
2
make
./test-trans -M 64 -N 64

结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:8546, misses:1699, evictions:1667

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1699

TEST_TRANS_RESULTS=1:1699

$61\times 67$

经过尝试后选择$8\times17$的分块,将矩阵分为如下$4$个部分

按照$A_{11},A_{21},A_{12},A_{22}$的顺序处理,这部分依然利用局部变量,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
void transpose3(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
//使用本地变量,本地变量保存再寄存器上
int a[8];
for (ii = 0; ii < 56; ii += 8){
for (jj = 0; jj < 51; jj += 17){
for (j = 0; j < 17; j++){
for (i = 0; i < 8; i++){
a[i] = A[ii + i][jj + j];
}
for (i = 0; i < 8; i++){
B[jj + j][ii + i] = a[i];
}
}
}
}

//处理剩余行
for (j = 0; j < 51; j++){
for (i = 56; i < N; i++){
B[j][i] = A[i][j];
}
}

//处理剩余列
for (i = 0; i < 56; i++){
for (j = 51; j < N; j++){
B[j][i] = A[i][j];
}
}

//处理对角线
for (i = 56; i < N; i++){
for (j = 51; j < M; j++){
//tmp = A[i][j];
B[j][i] = A[i][j];
}
}
}

运行:

1
2
make
./test-trans -M 61 -N 67

结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6946, misses:1907, evictions:1875

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3756, misses:4423, evictions:4391

Summary for official submission (func 0): correctness=1 misses=1907

TEST_TRANS_RESULTS=1:1907

整体测试

1
./driver.py

结果如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
Part A: Testing cache simulator
Running ./test-csim
Your simulator Reference simulator
Points (s,E,b) Hits Misses Evicts Hits Misses Evicts
3 (1,1,1) 9 8 6 9 8 6 traces/yi2.trace
3 (4,2,4) 4 5 2 4 5 2 traces/yi.trace
3 (2,1,4) 2 3 1 2 3 1 traces/dave.trace
3 (2,1,3) 167 71 67 167 71 67 traces/trans.trace
3 (2,2,3) 201 37 29 201 37 29 traces/trans.trace
3 (2,4,3) 212 26 10 212 26 10 traces/trans.trace
3 (5,1,5) 231 7 0 231 7 0 traces/trans.trace
6 (5,1,5) 265189 21775 21743 265189 21775 21743 traces/long.trace
27


Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67

Cache Lab summary:
Points Max pts Misses
Csim correctness 27.0 27
Trans perf 32x32 8.0 8 289
Trans perf 64x64 8.0 8 1219
Trans perf 61x67 10.0 10 1907
Total points 53.0 53

本文标题:CMU 15-213 Lab4 Cache Lab

文章作者:Doraemonzzz

发布时间:2020年07月22日 - 21:03:00

最后更新:2020年07月22日 - 22:13:39

原始链接:http://doraemonzzz.com/2020/07/22/CMU 15-213 Lab4 Cache Lab/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。