CMU 15-213 Lab4 Cache Lab
课程主页:http://www.cs.cmu.edu/afs/cs/academic/class/15213-f15/www/schedule.html
课程资料:https://github.com/EugeneLiu/translationCSAPP
课程视频:https://www.bilibili.com/video/av31289365/
这一部分回顾CSAPP的Cache Lab。
参考资料:
https://www.jianshu.com/p/e68dd8305e9c
https://wdxmzy.com/csapp/thick-csapp-lab-4/2016/04/16/
https://www.jianshu.com/p/582367289391
备注,该实验需要安装valgrind
sudo apt install valgrind
Part A:Writing a Cache Simulator
说明
首先忽略I,因为文档中有如下说明:
- For this lab, we are interested only in data cache performance, so your simulator should ignore all
instruction cache accesses (lines starting with “I”).
其次忽略size参数,因为文档中有如下说明:
- For this this lab, you should assume that memory accesses are aligned properly, such that a single
memory access never crosses block boundaries. By making this assumption, you can ignore the
request sizes in the valgrind traces.
算法
该程序最重要的是L,S,M操作,对应如下三个函数
- Load
- Store
- Modify
Store函数和Load函数的操作相同,注意到文章中有如下说明:
- The data modify operation (M) is treated as a load followed by a store to the same address
所以Modify函数为Load函数加上Store函数,所以我们只需要设计Load函数,伪代码如下
- 第一轮迭代,计算最大的LRU,如果$(\text{cache[index_][i].valid_bit} == 1) \&\& (\text{cache[index_][i].tag == tag})$则hit,同时找到$\text{cache[index_][i].valid_bit} == 0$的位置
- 如果没有hit,输出miss
- 如果存在$\text{cache[index_][i].valid_bit} == 0$的位置,则直接保存结果
- 否则再次遍历,找到最大LRU对应的位置,保存结果
- 输出evict
代码
备注:不要自己实现printSummary函数,该函数仅用于自己调试。
#include "cachelab.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
void parse_parameter(int argc, char *argv[]);
void read_file(char *tracefile);
void parse_address(unsigned address);
void get_cache();
void Load();
void Store();
void Modify();
void print_help();
typedef struct cache_line{
unsigned valid_bit;
unsigned tag;
int LRU;
}cache_line;
//参数
int h, v, s, E, b, S, B;
unsigned tag, index_, offset;
char *tracefile;
//计数
int hits = 0;
int misses = 0;
int evictions = 0;
//参数的形式
char mode[4] = {'s', 'E', 'b', 't'};
//判断参数是否正确
int flag = 1;
//缓存
cache_line **cache;
int main(int argc, char *argv[]){
//解析参数
parse_parameter(argc, argv);
//如果参数正确才进行后续操作
if (flag){
//输出help信息
if (h == 1){
print_help();
}else{
//构造cache
get_cache();
//读取文件
read_file(tracefile);
//文件可能不存在,所以这里再加以判断
if (flag){
//输出
printSummary(hits, misses, evictions);
}
}
}else{
printf("%s: Missing required command line argument\n", argv[0]);
print_help();
}
return 0;
}
//输出help信息
void print_help(){
printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n");
printf("Options:\n");
printf(" -h Print this help message.\n");
printf(" -v Optional verbose flag.\n");
printf(" -s <num> Number of set index bits.\n");
printf(" -E <num> Number of lines per set.\n");
printf(" -b <num> Number of block offset bits.\n");
printf(" -t <file> Trace file.\n");
printf("Examples:\n");
printf(" linux> ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n");
printf(" linux> ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}
//参考https://blog.csdn.net/fengxinlinux/article/details/51541003
//申请缓存空间并初始化
void get_cache(){
cache = (cache_line **)malloc(S * sizeof(cache_line*));
for (int i = 0; i < S; i++){
cache[i] = (cache_line *)malloc(E * sizeof(cache_line));
for (int j = 0; j < E; j++){
//初始化
cache[i][j].valid_bit = 0;
cache[i][j].tag = 0;
cache[i][j].LRU = 0;
}
}
}
//解析参数
void parse_parameter(int argc, char *argv[]){
if ((argc != 9) && (argc != 10)){
printf("wrong argument\n");
flag = 0;
return;
}
int opt, n;
while ((opt = getopt(argc, argv, "s:E:b:t:vh")) != -1){
switch (opt) {
case 'h':
h = 1;
v = 0;
break;
case 'v':
h = 0;
v = 1;
break;
case 's':
s = atoi(optarg);
break;
case 'E':
E = atoi(optarg);
break;
case 'b':
b = atoi(optarg);
break;
case 't':
n = strlen(optarg);
tracefile = (char *)malloc(n + 1);
strcpy(tracefile, optarg);
break;
default:
printf("wrong argument\n");
flag = 0;
break;
}
}
if (flag){
S = 1 << s;
B = 1 << b;
}
}
//解析地址
void parse_address(unsigned address){
offset = address % (1 << b);
address >>= b;
index_ = address % (1 << s);
address >>= s;
tag = address;
}
//Load
void Load(){
//记录是否hit
int f = 0;
//最大lru
int max_lru = 0;
//可以存储的位置
int place = -1;
//匹配tag
for (int i = 0; i < E; i++){
//更新LRU
cache[index_][i].LRU++;
//更新max_lru
max_lru = (max_lru > cache[index_][i].LRU) ? max_lru : cache[index_][i].LRU;
//判断是否hit
if ((cache[index_][i].valid_bit == 1) && (cache[index_][i].tag == tag)){
f = 1;
cache[index_][i].LRU = 0;
}
//判断是否存在valid_bit=0的位置
if (cache[index_][i].valid_bit == 0){
place = i;
}
}
//如果没有hit则保存信息
if (f == 0){
misses++;
if (v){
printf(" miss");
}
//判断是否需要evict
if (place != -1){
cache[index_][place].valid_bit = 1;
cache[index_][place].tag = tag;
cache[index_][place].LRU = 0;
}else{
evictions++;
if (v){
printf(" eviction");
}
//遍历,找到LRU最大的cache
for (int i = 0; i < E; i++){
if (cache[index_][i].LRU == max_lru){
cache[index_][i].tag = tag;
cache[index_][i].LRU = 0;
break;
}
}
}
}else{
hits++;
if (v){
printf(" hit");
}
}
}
//Store
void Store(){
Load();
}
//Modify
void Modify(){
Load();
Store();
}
//读取文件
void read_file(char *tracefile){
FILE *pFile;
pFile = fopen(tracefile, "r");
//判断文件是否存在
if (pFile == NULL){
printf("%s: No such file or directory\n", tracefile);
flag = 0;
return;
}
char identifier;
unsigned address;
int size;
//读取
while (fscanf(pFile, " %c %x,%d", &identifier, &address, &size) > 0){
printf(" %c %x,%d\n", identifier, address, size);
if (identifier == 'I'){
continue;
}
parse_address(address);
printf("%x %x %x\n", tag, index_, offset);
//输出结果
if (v){
printf(" %c %x,%d", identifier, address, size);
}
if (identifier == 'L'){
Load();
}else if(identifier == 'S'){
Store();
}else{
Modify();
}
if (v){
printf("\n");
}
}
fclose(pFile);
}
//输出结果
/*
void printSummary(int hits, int misses, int evictions){
printf("hits:%d misses:%d evictions:%d\n", hits, misses, evictions);
}
*/
Part B: Optimizing Matrix Transpose
回顾缓存的结构
此处$s=5, E=1, b=5$,所以一共$32$组缓存块,每组$1$个缓存块,每个缓存块有$32$字节,因为本题考虑的变量为int($4$字节),所以一个缓存块存储$8$个int。
$32\times32$
每行有$32$个int,所以每行需要$4$个缓存块,从而矩阵中缓存index的分布(下图来自https://www.jianshu.com/p/e68dd8305e9c#fn1):
+--+--+--+--+
0 | 0| 1| 2| 3|
+--+--+--+--+
1 | 4| 5| 6| 7|
+--+--+--+--+
2 | 8| 9|10|11|
+--+--+--+--+
3 |12|13|14|15|
+--+--+--+--+
4 |16|17|18|19|
+--+--+--+--+
5 |20|21|22|23|
+--+--+--+--+
6 |24|25|26|27|
+--+--+--+--+
7 |28|29|30|31|
+--+--+--+--+
8 | 0| 1| 2| 3|
+--+--+--+--+
每一个小格子表示一个缓存块,格子中的数字是缓存块的index。
可以看到第0行和第8行缓存块的index是重复的。
所以按$8\times 8$分块,代码如下:
void transpose1_v0(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
for (i = 0; i < 8; i ++){
for (j = 0; j < 8; j++){
B[jj + j][ii + i] = A[ii + i][jj + j];
}
}
}
}
}
运行:
make
./test-trans -M 32 -N 32
结果如下:
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311
Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151
Summary for official submission (func 0): correctness=1 misses=343
TEST_TRANS_RESULTS=1:343
超过限制$300$,下面进行分析(依然参考https://www.jianshu.com/p/e68dd8305e9c#fn1):
每个$8\times 8$分块会产生$8$次miss(每行$1$次),一共有$32\times 32/(8\times 8)=16$个分块,加上两个矩阵,所以会产生$16\times 8\times 2= 256$次miss,和实际结果相差较多。
按照网上大佬的提示,在tracegen.c的如下代码处
/* Fill A with data */
initMatrix(M,N, A, B);
增加打印A,B位置的代码
printf("%p %p\n", A, B);
编译后运行
./tracegen -M 32 -N 32
0x5639516b0080 0x5639516f0080
接着回顾高速缓存读取的方法,索引组判断属于哪个组,块偏移确定了$B$个字节的数据块中的字偏移,标记位确定标记:
利用如下代码输出缓存信息:
#include <stdio.h>
//解析地址
void parse_address(unsigned address){
int b = 5;
int s = 5;
unsigned offset = address % (1 << b);
address >>= b;
unsigned index_ = address % (1 << s);
address >>= s;
unsigned tag = address;
printf("%x %x %x\n", tag, index_, offset);
}
int main(){
unsigned long address = 0x55bd9d978080;
parse_address(address);
address = 0x55bd9d9b8080;
parse_address(address);
return 0;
}
编译并且运行得到
gcc -o cache_address cache_address.c
./cache_address
2765e0 4 0
2766e0 4 0
说明A,B的同一行在缓存的同一组,这一步会产生问题,来看一个具体例子:
tmp = A[0][0] //cache:A[0]
B[0][0] = tmp //cache:B[0](A[0]和B[0]在同一组)
tmp = A[0][1] //cache:A[0]
B[1][0] = tmp //cache:A[0],B[1]
......
解决方法:A的每一行先存入本地变量,然后再复制到B,代码如下:
void transpose1(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
for (i = 0; i < 8; i ++){
//使用本地变量,本地变量保存在寄存器上
int a[8];
for (j = 0; j < 8; j++){
a[j] = A[ii + i][jj + j];
}
for (j = 0; j < 8; j++){
B[jj + j][ii + i] = a[j];
}
}
}
}
}
运行:
make
./test-trans -M 32 -N 32
结果如下:
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:289, evictions:257
Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151
Summary for official submission (func 0): correctness=1 misses=289
TEST_TRANS_RESULTS=1:289
$64\times 64$
参考:
思路是将$8\times 8$的分块拆成$4$个$4\times 4$的分块,代码如下:
void transpose2(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
int a0, a1, a2, a3, a4, a5, a6, a7;
//将8*8矩阵分为4块,分别为A11,A12,A21,A22
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
//B11=A11.T,B12=A12.T
for (i = 0; i < 4; i++){
//本地变量
a0 = A[ii + i][jj + 0];
a1 = A[ii + i][jj + 1];
a2 = A[ii + i][jj + 2];
a3 = A[ii + i][jj + 3];
a4 = A[ii + i][jj + 4];
a5 = A[ii + i][jj + 5];
a6 = A[ii + i][jj + 6];
a7 = A[ii + i][jj + 7];
//保存前4个元素,按列
B[jj + 0][ii + i] = a0;
B[jj + 1][ii + i] = a1;
B[jj + 2][ii + i] = a2;
B[jj + 3][ii + i] = a3;
//保存后4个元素,按列
B[jj + 0][ii + i + 4] = a4;
B[jj + 1][ii + i + 4] = a5;
B[jj + 2][ii + i + 4] = a6;
B[jj + 3][ii + i + 4] = a7;
}
for (i = 4; i < 8; i++){
//将B12每行元素保存在本地变量
a0 = B[jj + i - 4][ii + 0 + 4];
a1 = B[jj + i - 4][ii + 1 + 4];
a2 = B[jj + i - 4][ii + 2 + 4];
a3 = B[jj + i - 4][ii + 3 + 4];
//B12=A21.T
for (j = 0; j < 4; j++){
B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
}
//B21=B12=A12.T,按行
B[jj + i][ii + 0] = a0;
B[jj + i][ii + 1] = a1;
B[jj + i][ii + 2] = a2;
B[jj + i][ii + 3] = a3;
//B22=A22.T
for (j = 4; j < 8; j++){
B[jj + i][ii + j] = A[ii + j][jj + i];
}
}
}
}
}
运行:
make
./test-trans -M 64 -N 64
结果如下:
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:9026, misses:1219, evictions:1187
Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691
Summary for official submission (func 0): correctness=1 misses=1219
TEST_TRANS_RESULTS=1:1219
作为对比,这里提供另一个函数:
void transpose4(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
int a0, a1, a2, a3, a4, a5, a6, a7;
//将8*8矩阵分为4块,分别为A11,A12,A21,A22
for (ii = 0; ii < N; ii += 8){
for (jj = 0; jj < M; jj += 8){
//B11=A11.T,B12=A12
for (i = 0; i < 4; i++){
//本地变量
a0 = A[ii + i][jj + 0];
a1 = A[ii + i][jj + 1];
a2 = A[ii + i][jj + 2];
a3 = A[ii + i][jj + 3];
a4 = A[ii + i][jj + 4];
a5 = A[ii + i][jj + 5];
a6 = A[ii + i][jj + 6];
a7 = A[ii + i][jj + 7];
//保存前4个元素,按列
B[jj + 0][ii + i] = a0;
B[jj + 1][ii + i] = a1;
B[jj + 2][ii + i] = a2;
B[jj + 3][ii + i] = a3;
//保存后4个元素,按行
B[jj + i][ii + 4] = a4;
B[jj + i][ii + 5] = a5;
B[jj + i][ii + 6] = a6;
B[jj + i][ii + 7] = a7;
}
for (i = 4; i < 8; i++){
//将B12每行元素保存在本地变量
a0 = B[jj + i - 4][ii + 0 + 4];
a1 = B[jj + i - 4][ii + 1 + 4];
a2 = B[jj + i - 4][ii + 2 + 4];
a3 = B[jj + i - 4][ii + 3 + 4];
//B12=A21.T
for (j = 0; j < 4; j++){
B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
}
//B21=B12.T=A12.T,按列(这部分会造成miss)
B[jj + 4][ii + i - 4] = a0;
B[jj + 5][ii + i - 4] = a1;
B[jj + 6][ii + i - 4] = a2;
B[jj + 7][ii + i - 4] = a3;
//B22=A22.T
for (j = 4; j < 8; j++){
B[jj + j][ii + i] = A[ii + i][jj + j];
}
}
}
}
}
运行:
make
./test-trans -M 64 -N 64
结果如下:
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:8546, misses:1699, evictions:1667
Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691
Summary for official submission (func 0): correctness=1 misses=1699
TEST_TRANS_RESULTS=1:1699
$61\times 67$
经过尝试后选择$8\times17$的分块,将矩阵分为如下$4$个部分
按照$A_{11},A_{21},A_{12},A_{22}$的顺序处理,这部分依然利用局部变量,代码如下:
void transpose3(int M, int N, int A[N][M], int B[M][N]){
int ii, jj, i, j;
//使用本地变量,本地变量保存再寄存器上
int a[8];
for (ii = 0; ii < 56; ii += 8){
for (jj = 0; jj < 51; jj += 17){
for (j = 0; j < 17; j++){
for (i = 0; i < 8; i++){
a[i] = A[ii + i][jj + j];
}
for (i = 0; i < 8; i++){
B[jj + j][ii + i] = a[i];
}
}
}
}
//处理剩余行
for (j = 0; j < 51; j++){
for (i = 56; i < N; i++){
B[j][i] = A[i][j];
}
}
//处理剩余列
for (i = 0; i < 56; i++){
for (j = 51; j < N; j++){
B[j][i] = A[i][j];
}
}
//处理对角线
for (i = 56; i < N; i++){
for (j = 51; j < M; j++){
//tmp = A[i][j];
B[j][i] = A[i][j];
}
}
}
运行:
make
./test-trans -M 61 -N 67
结果如下:
Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6946, misses:1907, evictions:1875
Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3756, misses:4423, evictions:4391
Summary for official submission (func 0): correctness=1 misses=1907
TEST_TRANS_RESULTS=1:1907
整体测试
./driver.py
结果如下
Part A: Testing cache simulator
Running ./test-csim
Your simulator Reference simulator
Points (s,E,b) Hits Misses Evicts Hits Misses Evicts
3 (1,1,1) 9 8 6 9 8 6 traces/yi2.trace
3 (4,2,4) 4 5 2 4 5 2 traces/yi.trace
3 (2,1,4) 2 3 1 2 3 1 traces/dave.trace
3 (2,1,3) 167 71 67 167 71 67 traces/trans.trace
3 (2,2,3) 201 37 29 201 37 29 traces/trans.trace
3 (2,4,3) 212 26 10 212 26 10 traces/trans.trace
3 (5,1,5) 231 7 0 231 7 0 traces/trans.trace
6 (5,1,5) 265189 21775 21743 265189 21775 21743 traces/long.trace
27
Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67
Cache Lab summary:
Points Max pts Misses
Csim correctness 27.0 27
Trans perf 32x32 8.0 8 289
Trans perf 64x64 8.0 8 1219
Trans perf 61x67 10.0 10 1907
Total points 53.0 53