Deep Learning Systems HW0
This post reviews HW0; the goal is mainly to get familiar with the codebase and to reproduce some basic backpropagation by hand.
Course homepage:
References:
- https://github.com/hsjeong5/MNIST-for-Numpy/blob/master/mnist.py
- https://mattpetersen.github.io/load-mnist-with-numpy
Question 1: A basic add function, and testing/autograding basics
This part is straightforward; it mainly serves to get familiar with the code:
def add(x, y):
    """ A trivial 'add' function you should implement to get used to the
    autograder and submission system. The solution to this problem is in the
    homework notebook.

    Args:
        x (Python number or numpy array)
        y (Python number or numpy array)

    Return:
        Sum of x + y
    """
    ### BEGIN YOUR CODE
    return x + y
    ### END YOUR CODE
Question 2: Loading MNIST data
Based on the references above, the code is as follows:
import gzip
import numpy as np

def parse_mnist(image_filename, label_filename):
    """ Read an images and labels file in MNIST format. See this page:
    http://yann.lecun.com/exdb/mnist/ for a description of the file format.

    Args:
        image_filename (str): name of gzipped images file in MNIST format
        label_filename (str): name of gzipped labels file in MNIST format

    Returns:
        Tuple (X,y):
            X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded
                data. The dimensionality of the data should be
                (num_examples x input_dim) where 'input_dim' is the full
                dimension of the data, e.g., since MNIST images are 28x28, it
                will be 784. Values should be of type np.float32, and the data
                should be normalized to have a minimum value of 0.0 and a
                maximum value of 1.0. The normalization should be applied uniformly
                across the whole dataset, _not_ individual images.

            y (numpy.ndarray[dtype=np.uint8]): 1D numpy array containing the
                labels of the examples. Values should be of type np.uint8 and
                for MNIST will contain the values 0-9.
    """
    ### BEGIN YOUR CODE
    # The images file has a 16-byte header (magic number, count, rows, cols);
    # the labels file has an 8-byte header (magic number, count).
    with gzip.open(image_filename, "rb") as f:
        X = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 784).astype('float32') / 255
    with gzip.open(label_filename, "rb") as f:
        y = np.frombuffer(f.read(), np.uint8, offset=8)
    return X, y
    ### END YOUR CODE
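As a quick sanity check (the file paths below are illustrative; point them at wherever the gzipped MNIST files actually live), the loaded arrays should match the shapes, dtypes, and value range promised by the docstring:

X_tr, y_tr = parse_mnist("data/train-images-idx3-ubyte.gz",
                         "data/train-labels-idx1-ubyte.gz")
print(X_tr.shape, X_tr.dtype, X_tr.min(), X_tr.max())  # (60000, 784) float32 0.0 1.0
print(y_tr.shape, y_tr.dtype)                          # (60000,) uint8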
Question 3: Softmax loss
Implement the following per-example softmax (cross-entropy) loss and average it over the batch:

$$\ell_{\mathrm{softmax}}(z, y) = \log \sum_{i=1}^{k} \exp z_i - z_y$$

The code is as follows:
def softmax_loss(Z, y):
    """ Return softmax loss. Note that for the purposes of this assignment,
    you don't need to worry about "nicely" scaling the numerical properties
    of the log-sum-exp computation, but can just compute this directly.

    Args:
        Z (np.ndarray[np.float32]): 2D numpy array of shape
            (batch_size, num_classes), containing the logit predictions for
            each class.
        y (np.ndarray[np.int8]): 1D numpy array of shape (batch_size, )
            containing the true label of each example.

    Returns:
        Average softmax loss over the sample.
    """
    ### BEGIN YOUR CODE
    # sum of exponentials over the class dimension for each example
    exp_sum_z = np.sum(np.exp(Z), axis=-1)
    # pick out the logit of the true class for each example
    b = Z.shape[0]
    z_y = Z[np.arange(b), y]
    loss = np.mean(np.log(exp_sum_z) - z_y)
    return loss
    ### END YOUR CODE
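The assignment explicitly allows the direct computation above, but for large logits np.exp can overflow. A minimal numerically stable sketch (not required by the autograder) subtracts the per-row maximum, which leaves the loss unchanged because log-sum-exp is shift-invariant:

def softmax_loss_stable(Z, y):
    # Shift each row by its max so exp() never overflows; the loss is unchanged.
    Z_shift = Z - Z.max(axis=-1, keepdims=True)
    log_sum_exp = np.log(np.sum(np.exp(Z_shift), axis=-1))
    return np.mean(log_sum_exp - Z_shift[np.arange(Z.shape[0]), y])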
Question 4: Stochastic gradient descent for softmax regression
Compute the gradient according to the following formula, where $Z = \mathrm{softmax}(X\Theta)$ is applied row-wise, $I_y$ is the one-hot encoding of the labels, and $B$ is the minibatch size:

$$\nabla_\Theta \ell_{\mathrm{softmax}}(X\Theta, y) = \frac{1}{B} X^T (Z - I_y)$$

The code is as follows:
def softmax(x):
    # Row-wise softmax: exponentiate, then normalize each row to sum to 1.
    x1 = np.exp(x)
    return x1 / np.sum(x1, axis=-1, keepdims=True)

def softmax_regression_epoch(X, y, theta, lr = 0.1, batch=100):
    """ Run a single epoch of SGD for softmax regression on the data, using
    the step size lr and specified batch size. This function should modify the
    theta matrix in place, and you should iterate through batches in X _without_
    randomizing the order.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        theta (np.ndarray[np.float32]): 2D array of softmax regression
            parameters, of shape (input_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    n = X.shape[0]
    step = n // batch
    for i in range(step + 1):
        start = i * batch
        end = min(start + batch, n)
        if start == end:
            break
        x1 = X[start: end]
        y1 = y[start: end]
        b = end - start  # actual minibatch size (the last batch may be smaller)
        # Z - I_y: subtract 1 from the probability of the true class
        z = softmax(np.matmul(x1, theta))
        z[np.arange(b), y1] -= 1
        # gradient of the average loss over the minibatch, then in-place SGD step
        grad = np.matmul(x1.transpose(), z) / b
        theta -= lr * grad
    ### END YOUR CODE
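To check that the update really uses the gradient $\frac{1}{B} X^T (Z - I_y)$, a quick finite-difference comparison on small random data can help (everything below, including numerical_softmax_grad, is an illustrative sketch rather than part of the homework):

def numerical_softmax_grad(X, y, theta, eps=1e-5):
    # Central-difference approximation of d softmax_loss(X @ theta, y) / d theta.
    grad = np.zeros_like(theta)
    for i in range(theta.shape[0]):
        for j in range(theta.shape[1]):
            theta[i, j] += eps
            lp = softmax_loss(X @ theta, y)
            theta[i, j] -= 2 * eps
            lm = softmax_loss(X @ theta, y)
            theta[i, j] += eps
            grad[i, j] = (lp - lm) / (2 * eps)
    return grad

np.random.seed(0)
X = np.random.randn(50, 5)
y = np.random.randint(0, 3, size=50)
theta = np.zeros((5, 3))
analytic = X.T @ (softmax(X @ theta) - np.eye(3)[y]) / X.shape[0]
print(np.abs(analytic - numerical_softmax_grad(X, y, theta)).max())  # expect a tiny value, ~1e-10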
Question 5: SGD for a two-layer neural network
The problem we need to solve is the two-layer network (with no bias terms):

$$\mathop{\mathrm{minimize}}_{W_1, W_2}\ \frac{1}{m}\sum_{i=1}^{m} \ell_{\mathrm{softmax}}\big(\mathrm{ReLU}(x_i^T W_1)\, W_2,\ y_i\big)$$

For a minibatch $X$ of size $B$, define:

$$Z_1 = \mathrm{ReLU}(X W_1), \qquad G_2 = \mathrm{softmax}(Z_1 W_2) - I_y, \qquad G_1 = \mathbf{1}\{Z_1 > 0\} \circ (G_2 W_2^T)$$

where $\circ$ denotes element-wise multiplication. The gradients are then:

$$\nabla_{W_1} = \frac{1}{B} X^T G_1, \qquad \nabla_{W_2} = \frac{1}{B} Z_1^T G_2$$

The code:
def nn_epoch(X, y, W1, W2, lr = 0.1, batch=100):
    """ Run a single epoch of SGD for a two-layer neural network defined by the
    weights W1 and W2 (with no bias terms):
        logits = ReLU(X * W1) * W2
    The function should use the step size lr, and the specified batch size (and
    again, without randomizing the order of X). It should modify the
    W1 and W2 matrices in place.

    Args:
        X (np.ndarray[np.float32]): 2D input array of size
            (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        W1 (np.ndarray[np.float32]): 2D array of first layer weights, of shape
            (input_dim, hidden_dim)
        W2 (np.ndarray[np.float32]): 2D array of second layer weights, of shape
            (hidden_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD minibatch

    Returns:
        None
    """
    ### BEGIN YOUR CODE
    n = X.shape[0]
    step = n // batch
    for i in range(step + 1):
        start = i * batch
        end = min(start + batch, n)
        if start == end:
            break
        x1 = X[start: end]
        y1 = y[start: end]
        b = end - start  # actual minibatch size (the last batch may be smaller)
        # forward: Z1 = ReLU(X W1)
        Z1 = np.matmul(x1, W1)
        Z1[Z1 < 0] = 0
        # G2 = softmax(Z1 W2) - I_y
        G2 = softmax(np.matmul(Z1, W2))
        G2[np.arange(b), y1] -= 1
        # G1 = 1{Z1 > 0} * (G2 W2^T)
        G1 = np.matmul(G2, W2.transpose())
        G1[Z1 <= 0] = 0
        # gradients and in-place SGD updates
        W1_grad = np.matmul(x1.transpose(), G1) / b
        W2_grad = np.matmul(Z1.transpose(), G2) / b
        W1 -= lr * W1_grad
        W2 -= lr * W2_grad
    ### END YOUR CODE
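A minimal driver to see the epoch function in action (the random data, initialization, and the loss_err helper below are illustrative; the homework's own training loop runs on MNIST rather than random data):

def loss_err(h, y):
    # Average softmax loss and classification error of logits h against labels y.
    return softmax_loss(h, y), np.mean(h.argmax(axis=-1) != y)

np.random.seed(0)
n, d, hidden, k = 1000, 784, 100, 10
X = np.random.rand(n, d).astype(np.float32)
y = np.random.randint(0, k, size=n).astype(np.uint8)
W1 = (np.random.randn(d, hidden) / np.sqrt(hidden)).astype(np.float32)
W2 = (np.random.randn(hidden, k) / np.sqrt(k)).astype(np.float32)
for epoch in range(5):
    nn_epoch(X, y, W1, W2, lr=0.1, batch=100)
    train_loss, train_err = loss_err(np.maximum(X @ W1, 0) @ W2, y)
    print(f"epoch {epoch}: loss={train_loss:.4f}, err={train_err:.4f}")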
Question 6: Softmax regression in C++
Here backpropagation is implemented in C++; the main difficulty is that matrices are represented as flat, one-dimensional arrays in row-major order:
#include <cmath>      // exp
#include <cstddef>    // size_t
#include <algorithm>  // std::min

// z = x @ y, where x is (m, d) and y is (d, n), both row-major; caller owns the result.
float* matmul(float *x, float *y, int m, int d, int n) {
    float *z = new float[m * n];
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            int ij = i * n + j;
            z[ij] = 0;
            for (int k = 0; k < d; k++) {
                int ik = i * d + k;
                int kj = k * n + j;
                z[ij] += x[ik] * y[kj];
            }
        }
    }
    return z;
}

// Copy rows [start_row, end_row) of a row-major matrix with d columns; caller owns the result.
float* slice(const float *x, int start_row, int end_row, int d) {
    int n = (end_row - start_row) * d;
    int start = start_row * d;
    float* z = new float[n];
    for (int i = 0; i < n; i++) {
        z[i] = x[start + i];
    }
    return z;
}

unsigned char* slice(const unsigned char *x, int start_row, int end_row, int d) {
    int n = (end_row - start_row) * d;
    int start = start_row * d;
    unsigned char* z = new unsigned char[n];
    for (int i = 0; i < n; i++) {
        z[i] = x[start + i];
    }
    return z;
}

// Row-wise softmax of an (m, n) matrix; caller owns the result.
float* softmax(float *x, int m, int n) {
    float* res = new float[m * n];
    for (int i = 0; i < m; i++) {
        float s = 0;
        for (int j = 0; j < n; j++) {
            int index = i * n + j;
            res[index] = exp(x[index]);
            s += res[index];
        }
        for (int j = 0; j < n; j++) {
            int index = i * n + j;
            res[index] /= s;
        }
    }
    return res;
}

// Transpose an (m, n) matrix into an (n, m) matrix; caller owns the result.
float* transpose(float *x, int m, int n) {
    float *y = new float[m * n];
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            y[j * m + i] = x[i * n + j];
        }
    }
    return y;
}

// x -= y, element-wise, for (m, n) matrices.
void minus(float *x, float *y, int m, int n) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            int index = i * n + j;
            x[index] -= y[index];
        }
    }
}

// x *= a, element-wise, for an (m, n) matrix.
void mul(float *x, float a, int m, int n) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            int index = i * n + j;
            x[index] *= a;
        }
    }
}
void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
                                  float *theta, size_t m, size_t n, size_t k,
                                  float lr, size_t batch)
{
    /**
     * A C++ version of the softmax regression epoch code. This should run a
     * single epoch over the data defined by X and y (and sizes m,n,k), and
     * modify theta in place. Your function will probably want to allocate
     * (and then delete) some helper arrays to store the logits and gradients.
     *
     * Args:
     *     X (const float *): pointer to X data, of size m*n, stored in row
     *          major (C) format
     *     y (const unsigned char *): pointer to y data, of size m
     *     theta (float *): pointer to theta data, of size n*k, stored in row
     *          major (C) format
     *     m (size_t): number of examples
     *     n (size_t): input dimension
     *     k (size_t): number of classes
     *     lr (float): learning rate / SGD step size
     *     batch (int): SGD minibatch size
     *
     * Returns:
     *     (None)
     */
    /// BEGIN YOUR CODE
    int step = m / batch;
    for (int i = 0; i < step + 1; i++) {
        int start = i * batch;
        int end = std::min(start + batch, m);
        if (start == end) {
            break;
        }
        // row count of x1, y1 (the last minibatch may be smaller than batch)
        int l = end - start;
        // shape: l, n
        float *x1 = slice(X, start, end, n);
        // shape: l, 1
        unsigned char *y1 = slice(y, start, end, 1);
        // shape: l, k
        float *score = matmul(x1, theta, l, n, k);
        // shape: l, k
        float *z = softmax(score, l, k);
        // z - I_y, shape: l, k
        for (int r = 0; r < l; r++) {
            int j = y1[r];
            int index = r * k + j;
            z[index] -= 1;
        }
        // gradient: x1^T (z - I_y), shape: n, l then n, k
        float *x1_transpose = transpose(x1, l, n);
        float *grad = matmul(x1_transpose, z, n, l, k);
        // update: theta -= (lr / batch) * grad, shape: n, k
        mul(grad, 1.0 * lr / batch, n, k);
        minus(theta, grad, n, k);
        // free the per-minibatch buffers allocated by the helpers above
        delete[] x1;
        delete[] y1;
        delete[] score;
        delete[] z;
        delete[] x1_transpose;
        delete[] grad;
    }
    /// END YOUR CODE
}
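To verify the C++ version, one convenient check is to compile it into a Python extension and compare it against the NumPy implementation from Question 4. The module name simple_ml_ext and its pybind11 binding follow the homework's usual setup, but treat them as assumptions here; the comparison itself is just a sketch:

# Assumes the C++ file has been built into a pybind11 module named simple_ml_ext
# that exposes softmax_regression_epoch_cpp (an assumption based on the usual
# homework setup).
import numpy as np
from simple_ml_ext import softmax_regression_epoch_cpp

np.random.seed(0)
X = np.random.rand(500, 5).astype(np.float32)
y = np.random.randint(0, 3, size=500).astype(np.uint8)
theta_np = np.zeros((5, 3), dtype=np.float32)
theta_cpp = np.zeros((5, 3), dtype=np.float32)
softmax_regression_epoch(X, y, theta_np, lr=1.0, batch=100)  # NumPy version from Question 4
softmax_regression_epoch_cpp(X, y, theta_cpp, 1.0, 100)      # C++ version
print(np.linalg.norm(theta_np - theta_cpp))  # expect ~0 up to float32 rounding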