## Notes

• First derive the gradient in scalar form;
• Then convert it to matrix form;
• Use the dimensions of the inputs and the output to pin down the final expression;

• The forward pass (`compute`) returns an NDArray;
• The backward pass (`gradient`) returns a Tensor (a minimal sketch of this pattern follows the list);

• In the discussion below the output is denoted $o$;
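
For reference, a minimal sketch of that pattern (the `TensorOp`/`Tensor`/`NDArray` names follow the needle homework skeleton; the `Identity` op itself is only an illustration, not part of the assignment):

```python
class Identity(TensorOp):
    """Illustrative op: the forward pass returns the input unchanged."""

    def compute(self, a: NDArray) -> NDArray:
        # forward pass: plain NDArray in, plain NDArray out
        return a

    def gradient(self, out_grad: Tensor, node: Tensor):
        # backward pass: stays a Tensor so it remains part of the graph
        return out_grad
```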

## Question 1: Implementing forward computation & Question 2: Implementing backward computation

### EWiseAdd

```python
class EWiseAdd(TensorOp):
    def compute(self, a: NDArray, b: NDArray):
        return a + b

    def gradient(self, out_grad, node):
        # d(a + b)/da = 1, d(a + b)/db = 1
        return out_grad, out_grad
```

### AddScalar

```python
class AddScalar(TensorOp):
    def __init__(self, scalar):
        self.scalar = scalar

    def compute(self, a: NDArray):
        return a + self.scalar

    def gradient(self, out_grad, node):
        # the scalar contributes no gradient
        return out_grad
```

### EWiseMul

```python
class EWiseMul(TensorOp):
    def compute(self, a: NDArray, b: NDArray):
        return a * b

    def gradient(self, out_grad, node):
        lhs, rhs = node.inputs
        # d(a * b)/da = b, d(a * b)/db = a
        return out_grad * rhs, out_grad * lhs
```

### MulScalar

```python
class MulScalar(TensorOp):
    def __init__(self, scalar):
        self.scalar = scalar

    def compute(self, a: NDArray):
        return a * self.scalar

    def gradient(self, out_grad, node):
        # d(c * x)/dx = c
        return out_grad * self.scalar
```

### PowerScalar

```python
class PowerScalar(TensorOp):
    """Op raise a tensor to an (integer) power."""

    def __init__(self, scalar: int):
        self.scalar = scalar

    def compute(self, a: NDArray) -> NDArray:
        return array_api.power(a, self.scalar)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # d(x^n)/dx = n * x^(n-1); use the Tensor-level op so the result stays on the graph
        return out_grad * self.scalar * power_scalar(input, self.scalar - 1)
```

### EWiseDiv

```python
class EWiseDiv(TensorOp):
    """Op to element-wise divide two nodes."""

    def compute(self, a, b):
        return a / b

    def gradient(self, out_grad, node):
        lhs, rhs = node.inputs
        # d(a / b)/da = 1 / b, d(a / b)/db = -a / b^2
        return out_grad / rhs, -out_grad * lhs / (rhs * rhs)
```

### DivScalar

```python
class DivScalar(TensorOp):
    def __init__(self, scalar):
        self.scalar = scalar

    def compute(self, a):
        return a / self.scalar

    def gradient(self, out_grad, node):
        return out_grad / self.scalar
```

### Transpose

```python
class Transpose(TensorOp):
    def __init__(self, axes: Optional[tuple] = None):
        self.axes = axes

    def compute(self, a):
        if self.axes:
            x, y = self.axes[0], self.axes[1]
        else:
            x, y = -1, -2
        return array_api.swapaxes(a, x, y)

    def gradient(self, out_grad, node):
        if self.axes:
            x, y = self.axes[0], self.axes[1]
        else:
            x, y = -1, -2
        # swapping the same two axes again undoes the transpose
        return transpose(out_grad, axes=(x, y))
```

### Reshape

```python
class Reshape(TensorOp):
    def __init__(self, shape):
        self.shape = shape

    def compute(self, a):
        return array_api.reshape(a, self.shape)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # reshape the gradient back to the input's original shape
        return reshape(out_grad, input.shape)
```

### BroadcastTo

• Tensor → tensor: find the dimensions that were broadcast, sum the output gradient over them, and reshape the result back to the input's shape (a small NumPy check follows the code below);
• Scalar → tensor: simply sum over every dimension;

```python
class BroadcastTo(TensorOp):
    def __init__(self, shape):
        self.shape = shape

    def compute(self, a):
        return array_api.broadcast_to(a, self.shape)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # find the dimensions that were broadcast
        n1 = len(input.shape)
        n2 = len(self.shape)
        c = 1
        if n1 != n2:
            # scalar (lower-rank input) -> tensor: every output axis was broadcast
            axes = range(n2)
        else:
            # tensor -> tensor: only the axes whose sizes differ were broadcast
            axes = []
            for i in range(n1):
                if input.shape[i] != self.shape[i]:
                    axes.append(i)
                    c *= input.shape[i]  # for a valid broadcast this dimension is 1, so c stays 1
        # sum over the broadcast axes and restore the input's shape
        return reshape(summation(out_grad, axes=tuple(axes)), input.shape) / c
```
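As a quick sanity check of the rule above, here is a NumPy-only illustration (independent of the needle classes) of which axis gets summed and what value each input element receives:

```python
import numpy as np

x = np.ones((3, 1, 5))             # input
y = np.broadcast_to(x, (3, 4, 5))  # forward broadcast
g = np.ones_like(y)                # incoming gradient, same shape as the output

# axis 1 was broadcast (size 1 -> 4), so the gradient sums over it
grad_x = g.sum(axis=1, keepdims=True)
print(grad_x.shape)     # (3, 1, 5) -- matches x
print(grad_x[0, 0, 0])  # 4.0: each input element fed 4 output elements
```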

### Summation

• Suppose the input $x$ has shape $(3, 4, 5)$ and we sum over axis 1; the output then has shape $(3, 5)$;
• In the backward pass the incoming gradient $g$ has shape $(3, 5)$; first reshape it to $(3, 1, 5)$, then broadcast it to $(3, 4, 5)$ (a NumPy illustration follows the code below);

```python
class Summation(TensorOp):
    def __init__(self, axes: Optional[tuple] = None):
        self.axes = axes

    def compute(self, a):
        return array_api.sum(a, self.axes)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # normalize the axes: default to all axes, make them non-negative, sort them
        if self.axes is None:
            axes = range(len(input.shape))
        else:
            axes = self.axes
        n = len(input.shape)
        new_axes = []
        for x in axes:
            if x >= 0:
                new_axes.append(x)
            else:
                new_axes.append(x + n)
        new_axes = sorted(new_axes)
        # re-insert the summed-out axes with size 1, then broadcast back to the input shape
        new_shape = list(out_grad.shape)
        for axis in new_axes:
            new_shape.insert(axis, 1)
        return broadcast_to(reshape(out_grad, new_shape), input.shape)
```
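To make the $(3, 4, 5)$ example above concrete, a NumPy-only illustration of the reshape-then-broadcast step (independent of the needle classes):

```python
import numpy as np

x = np.random.rand(3, 4, 5)
out = x.sum(axis=1)    # forward: shape (3, 5)
g = np.ones_like(out)  # incoming gradient, shape (3, 5)

# backward: re-insert the summed axis with size 1, then broadcast
g_expanded = g.reshape(3, 1, 5)                # (3, 1, 5)
grad_x = np.broadcast_to(g_expanded, x.shape)  # (3, 4, 5)
print(grad_x.shape)  # (3, 4, 5): every input element gets gradient 1
```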

### MatMul

```python
class MatMul(TensorOp):
    def compute(self, a, b):
        return array_api.matmul(a, b)

    def gradient(self, out_grad, node):
        # shapes (batch dims may differ): lhs (..., a, b), rhs (..., b, c), out_grad (..., a, c)
        lhs, rhs = node.inputs
        # d(out)/d(lhs): out_grad @ rhs^T  -> (..., a, b)
        # d(out)/d(rhs): lhs^T @ out_grad  -> (..., b, c)
        lhs_grad = matmul(out_grad, transpose(rhs))
        rhs_grad = matmul(transpose(lhs), out_grad)
        # watch the shapes: if an operand was broadcast over batch dimensions,
        # sum the extra leading axes away to recover its original shape
        n1 = len(out_grad.shape)
        n2 = len(lhs.shape)
        n3 = len(rhs.shape)
        if n1 > n2:
            lhs_grad = summation(lhs_grad, axes=tuple(range(n1 - n2)))
        if n1 > n3:
            rhs_grad = summation(rhs_grad, axes=tuple(range(n1 - n3)))
        return lhs_grad, rhs_grad
```
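The batch-dimension handling matters because matmul broadcasts leading dimensions. A NumPy-only illustration of the shapes involved (just the shape rule, not needle code):

```python
import numpy as np

lhs = np.random.rand(6, 6, 5, 4)  # batched left operand
rhs = np.random.rand(4, 3)        # unbatched right operand, broadcast over (6, 6)
out = np.matmul(lhs, rhs)         # shape (6, 6, 5, 3)

# the gradient w.r.t. rhs before reduction has shape (6, 6, 4, 3);
# summing over the broadcast leading axes restores rhs's shape (4, 3)
g = np.ones_like(out)
rhs_grad = np.matmul(lhs.swapaxes(-1, -2), g).sum(axis=(0, 1))
print(rhs_grad.shape)  # (4, 3)
```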

### Negate

```python
class Negate(TensorOp):
    def compute(self, a):
        return -a

    def gradient(self, out_grad, node):
        # d(-x)/dx = -1
        return -out_grad
```

### Log

```python
class Log(TensorOp):
    def compute(self, a):
        return array_api.log(a)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # d(log x)/dx = 1 / x
        return out_grad / input
```

### Exp

```python
class Exp(TensorOp):
    def compute(self, a):
        return array_api.exp(a)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # d(exp x)/dx = exp x
        return out_grad * exp(input)
```

## Question 3: Topological sort

node_list contains the output nodes of the computational graph, not the whole graph. To obtain a topological order, run a post-order DFS from the nodes in node_list (a node's neighbors are its inputs). The code is below, followed by a small usage sketch:

```python
def find_topo_sort(node_list: List[Value]) -> List[Value]:
    """Given a list of nodes, return a topological sort list of nodes ending in them.

    A simple algorithm is to do a post-order DFS traversal on the given nodes,
    going backwards based on input edges. Since a node is added to the ordering
    after all its predecessors are traversed due to post-order DFS, we get a topological
    sort.
    """
    # 0: unvisited, 1: temporarily visited, 2: permanently visited
    visited = dict()
    # init
    for node in node_list:
        visited[node] = 0

    topo_order = []

    for node in node_list:
        if visited[node] == 0:
            topo_sort_dfs(node, visited, topo_order)

    return topo_order


def topo_sort_dfs(node, visited, topo_order):
    """Post-order DFS"""
    if visited[node] == 2:
        return
    visited[node] = 1
    for node1 in node.inputs:
        if node1 not in visited:
            visited[node1] = 0
        if visited[node1] == 0:
            topo_sort_dfs(node1, visited, topo_order)
    visited[node] = 2
    topo_order.append(node)
```
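A small usage sketch (hypothetical tensors; it assumes the function lives in `needle/autograd.py` as in the homework skeleton, reachable as `ndl.autograd.find_topo_sort`):

```python
import needle as ndl

x = ndl.Tensor([2.0])
y = x * x + x  # builds a small computational graph
order = ndl.autograd.find_topo_sort([y])
# inputs appear before the nodes that consume them; y comes last
assert order[-1] is y
```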

## Question 4: Implementing reverse mode differentiation

```python
def compute_gradient_of_variables(output_tensor, out_grad):
    """Take gradient of output node with respect to each node in node_list.

    Store the computed result in the grad field of each Variable.
    """
    # a map from node to a list of gradient contributions from each output node
    node_to_output_grads_list = {}
    # Special note on initializing gradient of output_tensor:
    # We are really taking a derivative of the scalar reduce_sum(output_node)
    # instead of the vector output_node. But this is the common case for loss function.
    node_to_output_grads_list[output_tensor] = [out_grad]

    # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
    reverse_topo_order = list(reversed(find_topo_sort([output_tensor])))

    for node in reverse_topo_order:
        # important: the adjoint of a node is the sum of the contributions from all paths
        node.grad = sum_node_list(node_to_output_grads_list[node])
        # init the contribution lists of the inputs
        for node1 in node.inputs:
            if node1 not in node_to_output_grads_list:
                node_to_output_grads_list[node1] = []
        # compute the partial adjoints and hand them to the inputs
        if not node.is_leaf():
            grads = node.op.gradient(node.grad, node)
            if not isinstance(grads, tuple):
                grads = (grads,)
            for i, node1 in enumerate(node.inputs):
                node_to_output_grads_list[node1].append(grads[i])
```
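A quick way to exercise this, assuming `compute_gradient_of_variables` is wired into `Tensor.backward()` as in the homework skeleton (the values here are only an illustration):

```python
import numpy as np
import needle as ndl

x = ndl.Tensor(np.array([1.0, 2.0, 3.0]))
y = ndl.summation(x * x)  # y = sum(x_i^2)
y.backward()              # triggers compute_gradient_of_variables internally
print(x.grad.numpy())     # expected: 2 * x = [2. 4. 6.]
```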

## Question 5: Softmax loss
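
Both versions below compute the same quantity: the average log-sum-exp of the logits minus the logit of the true class,

$$\ell(Z, y) = \frac{1}{B}\sum_{i=1}^{B}\Big(\log\sum_{j} \exp(Z_{ij}) - Z_{i,\,y_i}\Big),$$

where $B$ is the batch size. The first implementation works on NumPy arrays with integer labels, the second on needle Tensors with one-hot labels.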

```python
def softmax_loss(Z, y):
    """Return softmax loss.  Note that for the purposes of this assignment,
    you don't need to worry about "nicely" scaling the numerical properties
    of the log-sum-exp computation, but can just compute this directly.

    Args:
        Z (np.ndarray[np.float32]): 2D numpy array of shape
            (batch_size, num_classes), containing the logit predictions for
            each class.
        y (np.ndarray[np.int8]): 1D numpy array of shape (batch_size, )
            containing the true label of each example.

    Returns:
        Average softmax loss over the sample.
    """
    exp_sum_z = np.sum(np.exp(Z), axis=-1)
    b = Z.shape[0]
    # pick out the logit of the true class for each example
    z_y = Z[np.arange(b), y]
    loss = np.mean(np.log(exp_sum_z) - z_y)

    return loss
```

```python
def softmax_loss(Z, y_one_hot):
    """Return softmax loss.  Note that for the purposes of this assignment,
    you don't need to worry about "nicely" scaling the numerical properties
    of the log-sum-exp computation, but can just compute this directly.

    Args:
        Z (ndl.Tensor[np.float32]): 2D Tensor of shape
            (batch_size, num_classes), containing the logit predictions for
            each class.
        y (ndl.Tensor[np.int8]): 2D Tensor of shape (batch_size, num_classes)
            containing a 1 at the index of the true label of each example and
            zeros elsewhere.

    Returns:
        Average softmax loss over the sample. (ndl.Tensor[np.float32])
    """
    # (n, d) -> (n,)
    exp_sum_z = ndl.summation(ndl.exp(Z), axes=(-1,))
    b = Z.shape[0]
    # multiplying by the one-hot labels picks out the logit of the true class
    z_y = ndl.summation(ndl.multiply(Z, y_one_hot), axes=(-1,))
    loss = ndl.summation(ndl.log(exp_sum_z) - z_y) / b
    return loss
```

## Question 6: SGD for a two-layer neural network

```python
class ReLU(TensorOp):
    def compute(self, a):
        return array_api.maximum(a, 0)

    def gradient(self, out_grad, node):
        input, = node.inputs
        # build the 0/1 mask outside the graph so it is treated as a constant
        # (alternatively: input_relu = input.realize_cached_data())
        input_relu = relu(input).numpy()
        return out_grad * Tensor(input_relu > 0)
```

```python
def nn_epoch(X, y, W1, W2, lr=0.1, batch=100):
    """Run a single epoch of SGD for a two-layer neural network defined by the
    weights W1 and W2 (with no bias terms):
        logits = ReLU(X * W1) * W2
    The function should use the step size lr, and the specified batch size (and
    again, without randomizing the order of X).

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        W1 (ndl.Tensor[np.float32]): 2D array of first layer weights, of shape
            (input_dim, hidden_dim)
        W2 (ndl.Tensor[np.float32]): 2D array of second layer weights, of shape
            (hidden_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD mini-batch

    Returns:
        Tuple: (W1, W2)
            W1: ndl.Tensor[np.float32]
            W2: ndl.Tensor[np.float32]
    """
    n = X.shape[0]
    m = W2.shape[1]
    # one-hot encode the labels once for the whole epoch
    y_one_hot = np.zeros((n, m))
    y_one_hot[np.arange(n), y] = 1
    step = n // batch
    for i in range(step + 1):
        start = i * batch
        end = min(start + batch, n)
        if start == end:
            break
        x1 = ndl.Tensor(X[start:end])
        y1 = ndl.Tensor(y_one_hot[start:end])
        # forward
        Z = ndl.matmul(ndl.relu(ndl.matmul(x1, W1)), W2)
        # loss
        loss = softmax_loss(Z, y1)
        # backward
        loss.backward()
        # SGD step; rebuild the weights as fresh tensors detached from the old graph
        W1 = ndl.Tensor(W1.numpy() - lr * W1.grad.numpy())
        W2 = ndl.Tensor(W2.numpy() - lr * W2.grad.numpy())

    return W1, W2
```
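
A usage sketch on random data (the shapes, seed, and hyperparameters are illustrative assumptions only):

```python
import numpy as np
import needle as ndl

np.random.seed(0)
X = np.random.rand(500, 20).astype(np.float32)
y = np.random.randint(0, 3, size=500).astype(np.uint8)
W1 = ndl.Tensor(np.random.randn(20, 16).astype(np.float32) / np.sqrt(16))
W2 = ndl.Tensor(np.random.randn(16, 3).astype(np.float32) / np.sqrt(3))

# one epoch of mini-batch SGD; returns the updated weights
W1, W2 = nn_epoch(X, y, W1, W2, lr=0.1, batch=100)
```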