CMU 15-418的课程名为Parallel Computer Architecture and Programming，主要介绍并行计算相关内容。这里回顾Assignment 1，主要是熟悉多核的基本操作，利用ISPC编写一些并行程序。

# Assignment 1 Exploring Multi-Core and SIMD Parallelism

## 准备工作

git clone https://github.com/cmu15418/asst1-s18.git

## Problem 1: Parallel Fractal Generation Using Pthreads (15 points)

### 1,2,3

╰─± ./mandelbrot --view 1 --thread 1
[mandelbrot serial]:            [201.027] ms
Wrote image file mandelbrot-serial.ppm
++++                            (0.98x speedup from 1 threads)

// Struct for passing arguments to thread routine
typedef struct {
float x0, x1;
float y0, y1;
unsigned int width;
unsigned int height;
int startRow;
int totalRows;
int startCol;
int totalCols;
int maxIterations;
int* output;
} WorkerArgs;

void mandelbrotThread(
float x0, float y0, float x1, float y1,
int width, int height,
int maxIterations, int output[])
{
const static int MAX_THREADS = 32;

{
exit(1);
}

// 已经计算的height数量
int cum_height = 0;
// 已经计算的width数量
int cum_width = 0;

for (int i=0; i<numThreads; i++) {
// TODO: Set thread arguments here.
args[i].x0 = x0;
args[i].y0 = y0;
args[i].x1 = x1;
if (i == numThreads - 1) {
}
args[i].y1 = y1;
args[i].width = width;
args[i].height = height;
args[i].maxIterations = maxIterations;
// 起始行
args[i].startRow = cum_height;
args[i].startCol = cum_width;

args[i].output = output;

// 更新
}

// are created and the main app thread is used as a worker as
// well.

// wait for worker threads to complete
}

╰─± ./mandelbrot --view 1 --thread 2
[mandelbrot serial]:            [200.329] ms
Wrote image file mandelbrot-serial.ppm
++++                            (1.92x speedup from 2 threads)

╰─± ./mandelbrot --view 1 --thread 4
[mandelbrot serial]:            [200.309] ms
Wrote image file mandelbrot-serial.ppm
++++                            (2.31x speedup from 4 threads)

╰─± ./mandelbrot --view 1 --thread 8
[mandelbrot serial]:            [201.094] ms
Wrote image file mandelbrot-serial.ppm
++++                            (3.63x speedup from 8 threads)

╰─± ./mandelbrot --view 1 --thread 16
[mandelbrot serial]:            [200.019] ms
Wrote image file mandelbrot-serial.ppm
++++                            (5.84x speedup from 16 threads)

2 2 1.92
4 4 2.31
8 8 3.63
16 16 5.84

### 4

void mandelbrotSerial_v2(
float x0, float y0, float x1, float y1,
int width, int height,
int startCol, int totalCols,
int maxIterations,
int output[])
{
// 前闭后开
float dx = (x1 - x0) / width;
float dy = (y1 - y0) / height;

int endCol = startCol + totalCols;

for (int i = startCol; i < endCol; i++) {
for (int j = 0; j < height; j++) {
float x = x0 + i * dx;
float y = y0 + j * dy;

int index = (j * width + i);
output[index] = mandel(x, y, maxIterations);
}
}
}

double startTime = CycleTimer::currentSeconds();

// TODO: Implement worker thread here.
mandelbrotSerial_v2(args->x0, args->y0, args->x1, args->y1,
args->width, args->height, args->startCol, args->totalCols,
args->maxIterations, args->output);

double endTime = CycleTimer::currentSeconds();

return NULL;
}

float x0, float y0, float x1, float y1,
int width, int height,
int maxIterations, int output[])
{
const static int MAX_THREADS = 32;

{
exit(1);
}

// 已经计算的height数量
int cum_height = 0;
// 已经计算的width数量
int cum_width = 0;

for (int i=0; i<numThreads; i++) {
// TODO: Set thread arguments here.
args[i].x0 = x0;
args[i].y0 = y0;
args[i].x1 = x1;
if (i == numThreads - 1) {
}
args[i].y1 = y1;
args[i].width = width;
args[i].height = height;
args[i].maxIterations = maxIterations;
// 起始行
args[i].startRow = cum_height;
args[i].startCol = cum_width;

args[i].output = output;

// 更新
}

// are created and the main app thread is used as a worker as
// well.

// wait for worker threads to complete

}

╰─± ./mandelbrot --view 1 --thread 16
[mandelbrot serial]:            [200.328] ms
Wrote image file mandelbrot-serial.ppm
++++                            (4.14x speedup from 16 threads)

╰─± ./mandelbrot --view 2 --thread 16
[mandelbrot serial]:            [121.285] ms
Wrote image file mandelbrot-serial.ppm
++++                            (6.28x speedup from 16 threads)

void mandelbrotSerial_v3(
float x0, float y0, float x1, float y1,
int width, int height,
int startRow, int totalRows,
int maxIterations,
int output[])
{
// 前闭后开
float dx = (x1 - x0) / width;
float dy = (y1 - y0) / height;
int endRow = startRow + totalRows;

for (int j = startRow; j < endRow; j++) {
int i = 0;
int index = (j - startRow) * width;
float x;
float y = y0 + j * dy;
// 循环展开
for (; i + 1 < width; i += 2, index += 2) {
x = x0 + i * dx;
output[index] = mandel(x, y, maxIterations);

x = x0 + (i + 1) * dx;
output[index + 1] = mandel(x, y, maxIterations);
}
// 处理剩余部分
for (; i < width; ++i, ++index) {
x = x0 + i * dx;
output[index] = mandel(x, y, maxIterations);
}
}
}

double startTime = CycleTimer::currentSeconds();

// TODO: Implement worker thread here.
mandelbrotSerial_v3(args->x0, args->y0, args->x1, args->y1,
args->width, args->height, args->startRow, args->totalRows,
args->maxIterations, args->output);

double endTime = CycleTimer::currentSeconds();

return NULL;
}

float x0, float y0, float x1, float y1,
int width, int height,
int maxIterations, int output[])
{
const static int MAX_THREADS = 32;

{
exit(1);
}

// 已经计算的height数量
int cum_height = 0;
// 已经计算的width数量
int cum_width = 0;

for (int i=0; i<numThreads; i++) {
// TODO: Set thread arguments here.
args[i].x0 = x0;
args[i].y0 = y0;
args[i].x1 = x1;
if (i == numThreads - 1) {
}
args[i].y1 = y1;
args[i].width = width;
args[i].height = height;
args[i].maxIterations = maxIterations;
// 起始行
args[i].startRow = cum_height;
args[i].startCol = cum_width;

args[i].output = new int[args[i].totalRows * args[i].width];
// 更新
}

// are created and the main app thread is used as a worker as
// well.

// wait for worker threads to complete

// 合并结果
int start = 0;
for (int i = 0; i < numThreads; i++) {
int m = args[i].totalRows * args[i].width;
for (int j = 0; j < m; j++) {
output[start + j] = args[i].output[j];
}
start += m;
}
}

╰─± ./mandelbrot --view 1 --thread 16
[mandelbrot serial]:            [200.283] ms
Wrote image file mandelbrot-serial.ppm
++++                            (5.91x speedup from 16 threads)

╰─± ./mandelbrot --view 2 --thread 16
[mandelbrot serial]:            [125.893] ms
Wrote image file mandelbrot-serial.ppm
++++                            (6.66x speedup from 16 threads)

## Problem 2: Vectorizing Code Using SIMD Intrinsics (20 points)

### 1

void clampedExpVector(float* values, int* exponents, float* output, int N) {
// Implement your vectorized version of clampedExpSerial here
//  ...
int i = 0;
// r表示第一次处理的数量
int r = N % VECTOR_WIDTH;
if (r == 0) {
r = VECTOR_WIDTH;
}
__cmu418_vec_float x;
__cmu418_vec_float result;
__cmu418_vec_float constant;
__cmu418_vec_int y;
__cmu418_vec_float xpower;
__cmu418_vec_int zero, one, yAndOne;

for (i = 0; i < N; ) {
if (i == 0) {
// 前r个为1
} else {
// 全1
}

// 初始化为false
greaterThanZero = _cmu418_init_ones(0);
greater = _cmu418_init_ones(0);
constant = _cmu418_vset_float(4.18f);

// x = values[i];
// result = 1.f
result = _cmu418_vset_float(1.f);
// y = exponents[i]
// xpower = x
// 判断是否大于0
// while (y > 0)
while (_cmu418_cntbits(greaterThanZero)) {
// y & 0x1
// 判断y & 0x1是否为1
// result *= xpower;
// xpower = xpower * xpower;
// y >>= 1
// 判断是否大于0
}
// if (result > 4.18f)
// result = 4.18f
_cmu418_vmove_float(result, constant, greater);
// output[i] = result
// 更新
if (i == 0) {
i += r;
} else {
i += VECTOR_WIDTH;
}
}
}

╰─± ./vrun -s 16
CLAMPED EXPONENT (required)
****************** Printing Vector Unit Statistics *******************
Vector Width:              8
Total Vector Instructions: 124
Vector Utilization:        92.237903%
Utilized Vector Lanes:     915
Total Vector Lanes:        992
************************ Result Verification *************************
Passed!!!

### 2

╰─± ./vrun -s 10000
CLAMPED EXPONENT (required)
****************** Printing Vector Unit Statistics *******************
Vector Width:              2
Total Vector Instructions: 301610
Vector Utilization:        91.087000%
Utilized Vector Lanes:     549455
Total Vector Lanes:        603220
************************ Result Verification *************************
Passed!!!

╰─± ./vrun -s 10000
CLAMPED EXPONENT (required)
****************** Printing Vector Unit Statistics *******************
Vector Width:              4
Total Vector Instructions: 154195
Vector Utilization:        90.843250%
Utilized Vector Lanes:     560303
Total Vector Lanes:        616780
************************ Result Verification *************************
Passed!!!

╰─± ./vrun -s 10000
****************** Printing Vector Unit Statistics *******************
Vector Width:              8
Total Vector Instructions: 77485
Vector Utilization:        90.789024%
Utilized Vector Lanes:     562783
Total Vector Lanes:        619880
************************ Result Verification *************************
Passed!!!

╰─± ./vrun -s 10000
CLAMPED EXPONENT (required)
****************** Printing Vector Unit Statistics *******************
Vector Width:              16
Total Vector Instructions: 38750
Vector Utilization:        90.786935%
Utilized Vector Lanes:     562879
Total Vector Lanes:        620000
************************ Result Verification *************************
Passed!!!

### 3

span的含义暂时没有理解，目前的做法是将数组拆成$N/W$份，每份分别求和，存储到数组中，然后对数组按顺序调用_cmu418_hadd_float,_cmu418_interleave_float，调用数量为$\log_2W$，不难得到，第$i$次调用时，数组的第一个元素为

float arraySumVectorHelper(float* values) {
int i = 1;
__cmu418_vec_float result, tmp;
while (i < VECTOR_WIDTH) {
_cmu418_interleave_float(tmp, result);
i *= 2;
}

return values[0];
}

float arraySumVector(float* values, int N) {
// Implement your vectorized version here
//  ...
int l = N / VECTOR_WIDTH;
float res[VECTOR_WIDTH];
// 一次最多只能看max(N/ W, log2W)
for (int i = 0; i < VECTOR_WIDTH; i++) {
res[i] = arraySumSerial(values + i * l, l);
}

return arraySumVectorHelper(res);
}

ARRAY SUM (bonus)
Passed!!!

## Problem 3: Parallel Fractal Generation Using ISPC (15 points)

### 准备工作

export PATH=~/ispc-v1.17.0-linux/bin:\$PATH

### 3.1

╰─± ./mandelbrot_ispc -v 1
[mandelbrot serial]:            [215.696] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [46.147] ms
Wrote image file mandelbrot-ispc.ppm
(4.67x speedup from ISPC)

╰─± ./mandelbrot_ispc -v 2
[mandelbrot serial]:            [131.712] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [32.244] ms
Wrote image file mandelbrot-ispc.ppm
(4.08x speedup from ISPC)

╰─± ./mandelbrot_ispc -v 3
[mandelbrot serial]:            [308.290] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [68.822] ms
Wrote image file mandelbrot-ispc.ppm
(4.48x speedup from ISPC)

╰─± ./mandelbrot_ispc -v 4
[mandelbrot serial]:            [304.620] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [68.016] ms
Wrote image file mandelbrot-ispc.ppm
(4.48x speedup from ISPC)

╰─± ./mandelbrot_ispc -v 5
[mandelbrot serial]:            [60.267] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [13.930] ms
Wrote image file mandelbrot-ispc.ppm
(4.33x speedup from ISPC)

╰─± ./mandelbrot_ispc -v 6
[mandelbrot serial]:            [534.112] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [112.647] ms
Wrote image file mandelbrot-ispc.ppm
(4.74x speedup from ISPC)

### 3.2

#### 1,2

// slightly different kernel to support tasking
uniform float x1, uniform float y1,
uniform int width, uniform int height,
uniform int maxIterations,
uniform int output[])
{

// taskIndex is an ISPC built-in

uniform int yend = ystart + rowsPerTask;
if (yend + rowsPerTask > height) {
yend = height;
}

uniform float dx = (x1 - x0) / width;
uniform float dy = (y1 - y0) / height;

foreach (j = ystart ... yend, i = 0 ... width) {
float x = x0 + i * dx;
float y = y0 + j * dy;

int index = j * width + i;
output[index] = mandel(x, y, maxIterations);
}
}

export void mandelbrot_ispc_withtasks(uniform float x0, uniform float y0,
uniform float x1, uniform float y1,
uniform int width, uniform int height,
uniform int maxIterations,
uniform int output[])
{

// uniform int threadCount = 2;

width, height,
maxIterations,
output);
}


╰─± ./mandelbrot_ispc -t -v 1
[mandelbrot serial]:            [215.468] ms
Wrote image file mandelbrot-serial.ppm
[mandelbrot ispc]:              [46.115] ms
Wrote image file mandelbrot-ispc.ppm
[mandelbrot multicore ispc]:    [12.226] ms
(4.67x speedup from ISPC)
(17.62x speedup from task ISPC)

#### 3

Pthread 抽象（用于问题 1）和 ISPC 任务抽象有什么区别？ (create/join 和 (launch/sync) 机制之间在语义上有一些明显的差异，但这些差异的含义更微妙。这里有一个思想实验来指导你的答案：当你启动 10,000 个 ISPC 任务时会发生什么？什么 当你启动 10,000 个 pthread 时会发生什么？

## Problem 4: Iterative Square Root (10 points)

### 1

╰─± ./sqrt
[sqrt serial]:          [829.827] ms
[sqrt ispc]:            [188.124] ms
(4.41x speedup from ISPC)
(39.64x speedup from task ISPC)

### 2

// Generate data that gives high relative speedup
void initGood(float *values, int N) {
for (int i=0; i<N; i++)
{
// Todo: Choose values
values[i] = 2.998f - i / N;
}
}

╰─± ./sqrt -d g
[sqrt serial]:          [2101.552] ms
[sqrt ispc]:            [321.759] ms
(6.53x speedup from ISPC)
(59.64x speedup from task ISPC)

### 3

// Generate data that gives low relative speedup
void initBad(float *values, int N) {
for (int i=0; i<N; i++)
{
// Todo: Choose values
values[i] = 1.0f;
}
}


╰─± ./sqrt -d b
[sqrt serial]:          [27.776] ms
[sqrt ispc]:            [17.068] ms
(1.63x speedup from ISPC)
(2.26x speedup from task ISPC)

## Problem 5: BLAS saxpy (10 points)

### 1

╰─± ./saxpy
[saxpy serial]:         [12.381] ms     [24.072] GB/s   [1.615] GFLOPS
[saxpy streaming]:      [13.943] ms     [21.374] GB/s   [1.434] GFLOPS
[saxpy ispc]:           [15.216] ms     [19.586] GB/s   [1.314] GFLOPS
[saxpy task ispc]:      [14.207] ms     [20.977] GB/s   [1.408] GFLOPS
(0.89x speedup from streaming)
(0.81x speedup from ISPC)
(0.87x speedup from task ISPC)

### 3

void saxpyStreaming(int N,
float scale,
float X[],
float Y[],
float result[])
{
// Replace this code with ones that make use of the streaming instructions
// saxpySerial(N, scale, X, Y, result);
__m128 x, y;
// scale, scale, scale, scale
__m128 scalar = _mm_set1_ps(scale);
for (int i = 0; i < N; i += 4) {
// X[0], ... , X[3]
}