这次更新第5章习题。

参考资料:

https://dreamanddead.github.io/CSAPP-3e-Solutions/chapter4/

5.13

基本参数:

汇编代码

.L5
    vmovsd 0(%rbp,%rcx,8), %xmm1
    vmulsd (%rax,%rcx,8), %xmm1, %xmm1
    vaddsd %xmm1, %xmm0, %xmm0
    addq $1, %rcx
    cmpq %rbx, %rcx
    jne .L15
(a)

关键路径的分析:

原图:

简化后:

关键路径:

因此关键路径为加法。

(b)

此时数据类型为浮点数,所以下界为浮点数加法周期3.0。

(c)

此时数据类型为整数,所以下界为浮点数加法周期1.0。

(d)

因为关键路径上的操作为加法。

5.14

代码:

#include <stdio.h>

/*Inner product. Accumulate in temporary */
void inner5(vec_ptr u, vec_ptr v, data_t *dest){
    long i;
    long length = vec_length(u);
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;

    for (i = 0; i + 6 < length; i += 6){
        sum = ((((((sum + udata[i] * vdata[i]) + udata[i + 1] * vdata[i + 1]) + 
                          udata[i + 2] * vdata[i + 2]) + udata[i + 3] * vdata[i + 3]) + 
                          udata[i + 4] * vdata[i + 4]) + udata[i + 5] * vdata[i + 5]);
    }

    for (; i < length; i++){
        sum = sum + udata[i] * vdata[i];
    }

    *dest = sum;
}


int main(){
    return 0;
}

后续讨论参考课本368。

(a)
(b)

5.15

代码:

#include <stdio.h>

/*Inner product. Accumulate in temporary */
void inner6(vec_ptr u, vec_ptr v, data_t *dest){
    long i;
    long length = vec_length(u);
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum1 = (data_t) 0;
    data_t sum2 = (data_t) 0;
    data_t sum3 = (data_t) 0;
    data_t sum4 = (data_t) 0;
    data_t sum5 = (data_t) 0;
    data_t sum6 = (data_t) 0;

    for (i = 0; i + 6 < length; i += 6){
        sum1 = sum1 + udata[i] * vdata[i];
        sum2 = sum2 + udata[i + 1] * vdata[i + 1];
        sum3 = sum3 + udata[i + 2] * vdata[i + 2];
        sum4 = sum4 + udata[i + 3] * vdata[i + 3];
        sum5 = sum5 + udata[i + 4] * vdata[i + 4];;
        sum6 = sum6 + udata[i + 5] * vdata[i + 5];;
    }

    for (; i < length; i++){
        sum3 = sum3 + udata[i] * vdata[i];
    }

    *dest = sum1 + sum2 + sum3 + sum4 + sum5 + sum6;

}

int main(){
    return 0;
}

制约因素是容量,因为计算地址时需要整数加法单元。

5.16

代码如下:

#include <stdio.h>

/*Inner product. Accumulate in temporary */
void inner5(vec_ptr u, vec_ptr v, data_t *dest){
    long i;
    long length = vec_length(u);
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;

    for (i = 0; i + 6 < length; i += 6){
        sum = sum + (udata[i] * vdata[i] + udata[i + 1] * vdata[i + 1] + 
                     udata[i + 2] * vdata[i + 2] + udata[i + 3] * vdata[i + 3] + 
                     udata[i + 4] * vdata[i + 4] + udata[i + 5] * vdata[i + 5]);
    }

    for (; i < length; i++){
        sum = sum + udata[i] * vdata[i];
    }

    *dest = sum;
}

int main(){
    return 0;
}

5.17

#include <stdio.h>

void *memset1(void *s, int c, size_t n){
    size_t cnt = 0;
    unsigned char *schar = s;
    unsigned long slong = 0;
    unsigned char *addr = &slong;
    int k = sizeof(unsigned long);
    // 填充k个
    for (int i = 0; i < k; i++){
        *addr++ = c;
    }

    // 循环
    while (cnt + k < n) {
        *schar = slong;
        cnt += k;
        schar += k;
    }
    
    // 处理剩余部分
    while (cnt < n){
        *schar++ = (unsigned char) c;
        cnt++;
    }

    return s;
}

int main(){
    return 0;
}

5.18

#include <stdio.h>

double poly(double a[], double x, long degree){
    long i;
    double result = a[0];
    double xpwr = x;    /* Equals x^i at start of loop */
    for (i = 1; i <= degree; i++) {
        result += a[i] * xpwr;
        xpwr = x * xpwr;
    }
    return result;
}

// 循环展开
double poly1(double a[], double x, long degree){
    long i;
    double result = a[0];
    double xpwr = x;    /* Equals x^i at start of loop */
    double xpwr1 = xpwr * x;
    for (i = 1; i <= degree; i += 2) {
        result = result + a[i] * xpwr + a[i + 1] * xpwr1;
        xpwr = xpwr1;
        xpwr1 = x * xpwr;
    }

    for (i = 1; i <= degree; i++) {
        result += a[i] * xpwr;
        xpwr = x * xpwr;
    }

    return result;
}

// 并行累积
double poly2(double a[], double x, long degree){
    long i;
    double result0 = 0;
    double result1 = 0;
    double xpwr0 = x;    /* Equals x^i at start of loop */
    double y = x * x;
    double xpwr1 = y;
    for (i = 1; i + 1 <= degree; i += 2) {
        result0 = result0 + a[i] * xpwr0;
        result1 = result1 + a[i + 1] * xpwr1;
        xpwr0 = xpwr0 * y;
        xpwr1 = xpwr1 * y;
    }

    for (; i <= degree; i++) {
        result0 = result0 + a[i] * xpwr0;
        xpwr0 *= x;
    }

    return result0 + result1;
}

// 重新结合
double poly3(double a[], double x, long degree){
    long i;
    double result = a[0];
    double xpwr = x;    /* Equals x^i at start of loop */
    double xpwr1 = xpwr * x;
    for (i = 1; i <= degree; i += 2) {
        result = result + (a[i] * xpwr + a[i + 1] * xpwr1);
        xpwr = xpwr1;
        xpwr1 = x * xpwr;
    }

    for (i = 1; i <= degree; i++) {
        result += a[i] * xpwr;
        xpwr = x * xpwr;
    }

    return result;
}

int main(){
    return 0;
}

5.19

#include <stdio.h>

void psum1a(float a[], float p[], long n){
    long i;
    float last_val, val;
    last_val = 0;
    
    for (i = 0; i + 4 < n; i++){
        val = last_val + a[i];
        p[i] = val;
        val = val + a[i + 1];
        p[i + 1] = val;
        val = val + a[i + 2];
        p[i + 2] = val;
        val = val + a[i + 3];
        p[i + 3] = val;
        // update
        last_val = val;
    }

    for (; i < n; i++){
        val = last_val + a[i];
        p[i] = val;
        last_val = val;
    }
}

int main(){
    return 0;
}