CMU 15-213 Lab4 Cache Lab

课程主页：http://www.cs.cmu.edu/afs/cs/academic/class/15213-f15/www/schedule.html

课程资料：https://github.com/EugeneLiu/translationCSAPP

课程视频：https://www.bilibili.com/video/av31289365/

这一部分回顾CSAPP的Cache Lab。

参考资料：

https://www.jianshu.com/p/e68dd8305e9c

https://blog.codedragon.tech/2017/09/25/%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%B3%BB%E7%BB%9FCacheLab-PartB%E5%AE%9E%E9%AA%8C%E6%8A%A5%E5%91%8A/

https://wdxmzy.com/csapp/thick-csapp-lab-4/2016/04/16/

https://www.jianshu.com/p/582367289391

备注，该实验需要安装valgrind

sudo apt install valgrind

Part A:Writing a Cache Simulator

说明

首先忽略I，因为文档中有如下说明：

For this lab, we are interested only in data cache performance, so your simulator should ignore all
instruction cache accesses (lines starting with “I”).

其次忽略size参数，因为文档中有如下说明：

For this this lab, you should assume that memory accesses are aligned properly, such that a single
memory access never crosses block boundaries. By making this assumption, you can ignore the
request sizes in the valgrind traces.

算法

该程序最重要的是L,S,M操作，对应如下三个函数

Load
Store
Modify

Store函数和Load函数的操作相同，注意到文章中有如下说明：

The data modify operation (M) is treated as a load followed by a store to the same address

所以Modify函数为Load函数加上Store函数，所以我们只需要设计Load函数，伪代码如下

第一轮迭代，计算最大的LRU，如果$(\text{cache[index_][i].valid_bit} == 1) \&\& (\text{cache[index_][i].tag == tag})$则hit，同时找到$\text{cache[index_][i].valid_bit} == 0$的位置
如果没有hit，输出miss
如果存在$\text{cache[index_][i].valid_bit} == 0$的位置，则直接保存结果
否则再次遍历，找到最大LRU对应的位置，保存结果
输出evict

代码

备注：不要自己实现printSummary函数，该函数仅用于自己调试。

#include "cachelab.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>

void parse_parameter(int argc, char *argv[]);
void read_file(char *tracefile);
void parse_address(unsigned address);
void get_cache();
void Load();
void Store();
void Modify();
void print_help();

typedef struct cache_line{
    unsigned valid_bit;
    unsigned tag;
    int LRU;
}cache_line;

//参数
int h, v, s, E, b, S, B;
unsigned tag, index_, offset;
char *tracefile;
//计数
int hits = 0;
int misses = 0;
int evictions = 0;
//参数的形式
char mode[4] = {'s', 'E', 'b', 't'};
//判断参数是否正确
int flag = 1;
//缓存
cache_line **cache;

int main(int argc, char *argv[]){
    //解析参数
    parse_parameter(argc, argv);

    //如果参数正确才进行后续操作
    if (flag){
        //输出help信息
        if (h == 1){
            print_help();
        }else{
            //构造cache
            get_cache();
            //读取文件
            read_file(tracefile);
            //文件可能不存在，所以这里再加以判断
            if (flag){
                //输出
                printSummary(hits, misses, evictions);
            }

        }
    }else{
        printf("%s: Missing required command line argument\n", argv[0]);
        print_help();
    }

    return 0;
}

//输出help信息
void print_help(){
    printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n");
    printf("Options:\n");
    printf("  -h         Print this help message.\n");
    printf("  -v         Optional verbose flag.\n");
    printf("  -s <num>   Number of set index bits.\n");
    printf("  -E <num>   Number of lines per set.\n");
    printf("  -b <num>   Number of block offset bits.\n");
    printf("  -t <file>  Trace file.\n");
    printf("Examples:\n");
    printf("  linux>  ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n");
    printf("  linux>  ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}

//参考https://blog.csdn.net/fengxinlinux/article/details/51541003
//申请缓存空间并初始化
void get_cache(){
    cache = (cache_line **)malloc(S * sizeof(cache_line*));
    for (int i = 0; i < S; i++){
        cache[i] = (cache_line *)malloc(E * sizeof(cache_line));
        for (int j = 0; j < E; j++){
            //初始化
            cache[i][j].valid_bit = 0;
            cache[i][j].tag = 0;
            cache[i][j].LRU = 0;
        }
    }
}

//解析参数
void parse_parameter(int argc, char *argv[]){
    if ((argc != 9) && (argc != 10)){
        printf("wrong argument\n");
        flag = 0;
        return;
    }
    int opt, n;
    while ((opt = getopt(argc, argv, "s:E:b:t:vh")) != -1){
        switch (opt) {
            case 'h':
                h = 1;
                v = 0;
                break;
            case 'v':
                h = 0;
                v = 1;
                break;
            case 's':
                s = atoi(optarg);
                break;
            case 'E':
                E = atoi(optarg);
                break;
            case 'b':
                b = atoi(optarg);
                break;
            case 't':
                n = strlen(optarg);
                tracefile = (char *)malloc(n + 1);
                strcpy(tracefile, optarg);
                break;
            default:
                printf("wrong argument\n");
                flag = 0;
                break;
            }
        }

        if (flag){
            S = 1 << s;
            B = 1 << b;
        }
}

//解析地址
void parse_address(unsigned address){
    offset = address % (1 << b);
    address >>= b;
    index_ = address % (1 << s);
    address >>= s;
    tag = address;
}

//Load
void Load(){
    //记录是否hit
    int f = 0;
    //最大lru
    int max_lru = 0;
    //可以存储的位置
    int place = -1;
    //匹配tag
    for (int i = 0; i < E; i++){
        //更新LRU
        cache[index_][i].LRU++;
        //更新max_lru
        max_lru = (max_lru > cache[index_][i].LRU) ? max_lru : cache[index_][i].LRU;
        //判断是否hit
        if ((cache[index_][i].valid_bit == 1) && (cache[index_][i].tag == tag)){
            f = 1;
            cache[index_][i].LRU = 0;
        }
        //判断是否存在valid_bit=0的位置
        if (cache[index_][i].valid_bit == 0){
            place = i;
        }
    }
    //如果没有hit则保存信息
    if (f == 0){
        misses++;
        if (v){
            printf(" miss");
        }
        //判断是否需要evict
        if (place != -1){
            cache[index_][place].valid_bit = 1;
            cache[index_][place].tag = tag;
            cache[index_][place].LRU = 0;
        }else{
            evictions++;
            if (v){
                printf(" eviction");
            }
            //遍历，找到LRU最大的cache
            for (int i = 0; i < E; i++){
                if (cache[index_][i].LRU == max_lru){
                    cache[index_][i].tag = tag;
                    cache[index_][i].LRU = 0;
                    break;
                }
            }
        }
    }else{
        hits++;
        if (v){
            printf(" hit");
        }
    }
}

//Store
void Store(){
    Load();
}

//Modify
void Modify(){
    Load();
    Store();
}

//读取文件
void read_file(char *tracefile){
    FILE *pFile;
    pFile = fopen(tracefile, "r");
    //判断文件是否存在
    if (pFile == NULL){
        printf("%s: No such file or directory\n", tracefile);
        flag = 0;
        return;
    }

    char identifier;
    unsigned address;
    int size;

    //读取
    while (fscanf(pFile, " %c %x,%d", &identifier, &address, &size) > 0){
        printf(" %c %x,%d\n", identifier, address, size);
        if (identifier == 'I'){
            continue;
        }
        parse_address(address);
        printf("%x %x %x\n", tag, index_, offset);
        //输出结果
        if (v){
            printf(" %c %x,%d", identifier, address, size);
        }
        if (identifier == 'L'){
            Load();
        }else if(identifier == 'S'){
            Store();
        }else{
            Modify();
        }
        if (v){
            printf("\n");
        }
    }

    fclose(pFile);
}

//输出结果
/*
void printSummary(int hits, int misses, int evictions){
    printf("hits:%d misses:%d evictions:%d\n", hits, misses, evictions);
}
*/

Part B: Optimizing Matrix Transpose

回顾缓存的结构

此处$s=5, E=1, b=5$，所以一共$32$组缓存块，每组$1$个缓存块，每个缓存块有$32$字节，因为本题考虑的变量为int($4$字节)，所以一个缓存块存储$8$个int。

$32\times32$

每行有$32$个int，所以每行需要$4$个缓存块，从而矩阵中缓存index的分布（下图来自https://www.jianshu.com/p/e68dd8305e9c#fn1）：

 	+--+--+--+--+
 0  | 0| 1| 2| 3|
    +--+--+--+--+
 1  | 4| 5| 6| 7|
    +--+--+--+--+
 2  | 8| 9|10|11|
    +--+--+--+--+
 3  |12|13|14|15|
    +--+--+--+--+
 4  |16|17|18|19|
    +--+--+--+--+
 5  |20|21|22|23|
    +--+--+--+--+
 6  |24|25|26|27|
    +--+--+--+--+
 7  |28|29|30|31|
    +--+--+--+--+
 8  | 0| 1| 2| 3|
    +--+--+--+--+
每一个小格子表示一个缓存块，格子中的数字是缓存块的index。
可以看到第0行和第8行缓存块的index是重复的。

所以按$8\times 8$分块，代码如下：

void transpose1_v0(int M, int N, int A[N][M], int B[M][N]){
    int ii, jj, i, j;
    for (ii = 0; ii < N; ii += 8){
        for (jj = 0; jj < M; jj += 8){
            for (i = 0; i < 8; i ++){
                for (j = 0; j < 8; j++){
                    B[jj + j][ii + i] = A[ii + i][jj + j];
                }
            }
        }
    }
}

运行：

make
./test-trans -M 32 -N 32

结果如下：

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=343

TEST_TRANS_RESULTS=1:343

超过限制$300$，下面进行分析（依然参考https://www.jianshu.com/p/e68dd8305e9c#fn1）：

每个$8\times 8$分块会产生$8$次miss（每行$1$次），一共有$32\times 32/(8\times 8)=16$个分块，加上两个矩阵，所以会产生$16\times 8\times 2= 256$次miss，和实际结果相差较多。

按照网上大佬的提示，在tracegen.c的如下代码处

/* Fill A with data */
initMatrix(M,N, A, B);

增加打印A,B位置的代码

printf("%p %p\n", A, B);

编译后运行

./tracegen -M 32 -N 32
0x5639516b0080 0x5639516f0080

接着回顾高速缓存读取的方法，索引组判断属于哪个组，块偏移确定了$B$个字节的数据块中的字偏移，标记位确定标记：

利用如下代码输出缓存信息：

#include <stdio.h>

//解析地址
void parse_address(unsigned address){
    int b = 5;
    int s = 5;

    unsigned offset = address % (1 << b);
    address >>= b;
    unsigned index_ = address % (1 << s);
    address >>= s;
    unsigned tag = address;

    printf("%x %x %x\n", tag, index_, offset);
}

int main(){
    unsigned long address = 0x55bd9d978080;
    parse_address(address);
    address = 0x55bd9d9b8080;
    parse_address(address);

    return 0;
}

编译并且运行得到

gcc -o cache_address cache_address.c
./cache_address
2765e0 4 0
2766e0 4 0

说明A,B的同一行在缓存的同一组，这一步会产生问题，来看一个具体例子：

tmp = A[0][0] 			//cache:A[0]
B[0][0] = tmp 			//cache:B[0](A[0]和B[0]在同一组)
tmp = A[0][1]			//cache:A[0]
B[1][0] = tmp			//cache:A[0],B[1]
......

解决方法：A的每一行先存入本地变量，然后再复制到B，代码如下：

void transpose1(int M, int N, int A[N][M], int B[M][N]){
    int ii, jj, i, j;
    for (ii = 0; ii < N; ii += 8){
        for (jj = 0; jj < M; jj += 8){
            for (i = 0; i < 8; i ++){
                //使用本地变量，本地变量保存在寄存器上
                int a[8];
                for (j = 0; j < 8; j++){
                    a[j] = A[ii + i][jj + j];
                }
                for (j = 0; j < 8; j++){
                    B[jj + j][ii + i] = a[j];
                }
            }
        }
    }
}

运行：

make
./test-trans -M 32 -N 32

结果如下：

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:289, evictions:257

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=289

TEST_TRANS_RESULTS=1:289

$64\times 64$

参考：

https://blog.codedragon.tech/2017/09/25/%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%B3%BB%E7%BB%9FCacheLab-PartB%E5%AE%9E%E9%AA%8C%E6%8A%A5%E5%91%8A/

思路是将$8\times 8$的分块拆成$4$个$4\times 4$的分块，代码如下：

void transpose2(int M, int N, int A[N][M], int B[M][N]){
    int ii, jj, i, j;
    int a0, a1, a2, a3, a4, a5, a6, a7;
    //将8*8矩阵分为4块，分别为A11,A12,A21,A22
    for (ii = 0; ii < N; ii += 8){
        for (jj = 0; jj < M; jj += 8){
            //B11=A11.T,B12=A12.T
            for (i = 0; i < 4; i++){
                //本地变量
                a0 = A[ii + i][jj + 0];
                a1 = A[ii + i][jj + 1];
                a2 = A[ii + i][jj + 2];
                a3 = A[ii + i][jj + 3];
                a4 = A[ii + i][jj + 4];
                a5 = A[ii + i][jj + 5];
                a6 = A[ii + i][jj + 6];
                a7 = A[ii + i][jj + 7];
                //保存前4个元素，按列
                B[jj + 0][ii + i] = a0;
                B[jj + 1][ii + i] = a1;
                B[jj + 2][ii + i] = a2;
                B[jj + 3][ii + i] = a3;
                //保存后4个元素，按列
                B[jj + 0][ii + i + 4] = a4;
                B[jj + 1][ii + i + 4] = a5;
                B[jj + 2][ii + i + 4] = a6;
                B[jj + 3][ii + i + 4] = a7;
            }

            for (i = 4; i < 8; i++){
                //将B12每行元素保存在本地变量
                a0 = B[jj + i - 4][ii + 0 + 4];
                a1 = B[jj + i - 4][ii + 1 + 4];
                a2 = B[jj + i - 4][ii + 2 + 4];
                a3 = B[jj + i - 4][ii + 3 + 4];
                //B12=A21.T
                for (j = 0; j < 4; j++){
                    B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
                }
                //B21=B12=A12.T，按行
                B[jj + i][ii + 0] = a0;
                B[jj + i][ii + 1] = a1;
                B[jj + i][ii + 2] = a2;
                B[jj + i][ii + 3] = a3;
                //B22=A22.T
                for (j = 4; j < 8; j++){
                    B[jj + i][ii + j] = A[ii + j][jj + i];
                }
            }
		}
	}
}

运行：

make
./test-trans -M 64 -N 64

结果如下：

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:9026, misses:1219, evictions:1187

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1219

TEST_TRANS_RESULTS=1:1219

作为对比，这里提供另一个函数：

void transpose4(int M, int N, int A[N][M], int B[M][N]){
    int ii, jj, i, j;
    int a0, a1, a2, a3, a4, a5, a6, a7;
    //将8*8矩阵分为4块，分别为A11,A12,A21,A22
    for (ii = 0; ii < N; ii += 8){
        for (jj = 0; jj < M; jj += 8){
            //B11=A11.T,B12=A12
            for (i = 0; i < 4; i++){
                //本地变量
                a0 = A[ii + i][jj + 0];
                a1 = A[ii + i][jj + 1];
                a2 = A[ii + i][jj + 2];
                a3 = A[ii + i][jj + 3];
                a4 = A[ii + i][jj + 4];
                a5 = A[ii + i][jj + 5];
                a6 = A[ii + i][jj + 6];
                a7 = A[ii + i][jj + 7];
                //保存前4个元素，按列
                B[jj + 0][ii + i] = a0;
                B[jj + 1][ii + i] = a1;
                B[jj + 2][ii + i] = a2;
                B[jj + 3][ii + i] = a3;
                //保存后4个元素，按行
                B[jj + i][ii + 4] = a4;
                B[jj + i][ii + 5] = a5;
                B[jj + i][ii + 6] = a6;
                B[jj + i][ii + 7] = a7;
            }

            for (i = 4; i < 8; i++){
                //将B12每行元素保存在本地变量
                a0 = B[jj + i - 4][ii + 0 + 4];
                a1 = B[jj + i - 4][ii + 1 + 4];
                a2 = B[jj + i - 4][ii + 2 + 4];
                a3 = B[jj + i - 4][ii + 3 + 4];

                //B12=A21.T
                for (j = 0; j < 4; j++){
                    B[jj + i - 4][ii + j + 4] = A[ii + j + 4][jj + i - 4];
                }

                //B21=B12.T=A12.T，按列（这部分会造成miss）
                B[jj + 4][ii + i - 4] = a0;
                B[jj + 5][ii + i - 4] = a1;
                B[jj + 6][ii + i - 4] = a2;
                B[jj + 7][ii + i - 4] = a3;

                //B22=A22.T
                for (j = 4; j < 8; j++){
                    B[jj + j][ii + i] = A[ii + i][jj + j];
                }
            }
		}
	}
}

运行：

make
./test-trans -M 64 -N 64

结果如下：

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:8546, misses:1699, evictions:1667

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1699

TEST_TRANS_RESULTS=1:1699

$61\times 67$

经过尝试后选择$8\times17$的分块，将矩阵分为如下$4$个部分

$\left[\begin{array}{ll} A_{11} & A_{12} \\ A_{21} & A_{22} \end{array}\right]$

按照$A_{11},A_{21},A_{12},A_{22}$的顺序处理，这部分依然利用局部变量，代码如下：

void transpose3(int M, int N, int A[N][M], int B[M][N]){
    int ii, jj, i, j;
    //使用本地变量，本地变量保存再寄存器上
    int a[8];
    for (ii = 0; ii < 56; ii += 8){
        for (jj = 0; jj < 51; jj += 17){
            for (j = 0; j < 17; j++){
                for (i = 0; i < 8; i++){
                    a[i] = A[ii + i][jj + j];
                }
                for (i = 0; i < 8; i++){
                    B[jj + j][ii + i] = a[i];
                }
            }
        }
    }

    //处理剩余行
    for (j = 0; j < 51; j++){
        for (i = 56; i < N; i++){
            B[j][i] = A[i][j];
        }
    }

    //处理剩余列
    for (i = 0; i < 56; i++){
        for (j = 51; j < N; j++){
            B[j][i] = A[i][j];
        }
    }

    //处理对角线
    for (i = 56; i < N; i++){
        for (j = 51; j < M; j++){
            //tmp = A[i][j];
            B[j][i] = A[i][j];
        }
    }
}

运行：

make
./test-trans -M 61 -N 67

结果如下：

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6946, misses:1907, evictions:1875

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3756, misses:4423, evictions:4391

Summary for official submission (func 0): correctness=1 misses=1907

TEST_TRANS_RESULTS=1:1907

整体测试

./driver.py

结果如下

Part A: Testing cache simulator
Running ./test-csim
                        Your simulator     Reference simulator
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)       4       5       2       4       5       2  traces/yi.trace
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
    27


Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67

Cache Lab summary:
                        Points   Max pts      Misses
Csim correctness          27.0        27
Trans perf 32x32           8.0         8         289
Trans perf 64x64           8.0         8        1219
Trans perf 61x67          10.0        10        1907
          Total points    53.0        53