Course homepage: https://www.davidsilver.uk/teaching/

This post works through Easy21, the assignment from David Silver's reinforcement learning course.

References

https://github.com/hartikainen/easy21

https://github.com/mari-linhares/easy21

1 Implementation of Easy21

Implementing the environment

import numpy as np

# Intervals of the coarse coding used for linear function approximation (Part 4):
# 3 dealer intervals x 6 player intervals x 2 actions = 36 binary features
dealer = [[1, 4], [4, 7], [7, 10]]
player = [[1, 6], [4, 9], [7, 12], [10, 15], [13, 18], [16, 21]]
action = [0, 1]

def update(s, i):
    """
    Draw one card for player/dealer i and update its sum in place.

    Parameters
    ----------
    s : state array
    i : index (0: dealer, 1: player)

    Returns
    -------
    None.
    """
    # card value is uniform on 1-10; a red card (probability 1/3) is subtracted,
    # a black card (probability 2/3) is added
    p = np.random.rand()
    v = np.random.randint(1, 11)
    if p < 1 / 3:
        s[i] -= v
    else:
        s[i] += v
        
def init():
    # both dealer and player start with one black card valued 1-10
    return np.random.randint(1, 11, 2)

def judge(s, i):
    # check whether player/dealer i has gone bust (sum > 21 or < 1)
    return s[i] > 21 or s[i] < 1

def get_action():
    return np.random.randint(2)

class Easy21():
    def step(self, s, a):
        """
        Parameters
        ----------
        s : state array, dealer s[0], player s[1]
        a : action, hit: 0; stick: 1

        Returns
        -------
        next state s'
        reward r
        terminal flag
        """
        flag = False
        r = 0
        if a == 0:
            # hit: the player draws a card; going bust gives reward -1 and ends the episode
            update(s, 1)
            if judge(s, 1):
                r = -1
                flag = True
        else:
            # stick: the dealer draws until reaching 17 or going bust, then the episode ends
            flag = True
            while s[0] < 17 and s[0] > 0:
                update(s, 0)
            if judge(s, 0):
                # dealer bust
                r = 1
            else:
                # compare the final sums
                if s[0] < s[1]:
                    r = 1
                elif s[0] == s[1]:
                    r = 0
                else:
                    r = -1
        
        return s, r, flag
    
def epsilon_greedy(Q, N, key, N0):
    # epsilon_t = N0 / (N0 + N(s_t)), where N(s_t) is the visit count of state s_t
    n = N[key].sum()
    epsilon = N0 / (N0 + n)
    p = np.random.rand()
    if p >= epsilon:
        a = np.argmax(Q[key])
    else:
        a = get_action()
        
    return a

def get_feature(s, a):
    # binary feature vector of the coarse coding: one entry per (dealer interval, player interval, action) cuboid
    feature = []
    for d in dealer:
        for p in player:
            for act in action:
                if d[0] <= s[0] <= d[1] and p[0] <= s[1] <= p[1] and a == act:
                    feature.append(1)
                else:
                    feature.append(0)
                    
    return np.array(feature)

def greedy(theta, s, epsilon):
    # epsilon-greedy with respect to the linear value estimates theta . phi(s, a)
    p = np.random.rand()
    f1 = get_feature(s, 0)
    f2 = get_feature(s, 1)
    v = [np.dot(theta, f1), np.dot(theta, f2)]
    if p >= epsilon:
        a = np.argmax(v)
    else:
        a = get_action()
        
    return a

def error_square(Q1, Q2):
    # squared error between two value functions, compared on max_a Q(s, a)
    error = 0
    for key in Q1:
        error += (np.max(Q1[key]) - np.max(Q2[key])) ** 2
    
    return error

def get_dict():
    # zero-initialised table over all (dealer showing 1-10, player sum 1-21) states and both actions
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    res = dict()
    for x in X:
        for y in Y:
            res[(x, y)] = np.array([0.0, 0.0])
    
    return res
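
As a quick sanity check (a sketch, not part of the original assignment code), the snippet below plays a few episodes with uniformly random actions using only the functions defined above; the __main__ guard keeps it from running when env.py is imported by the later scripts.

if __name__ == "__main__":
    env = Easy21()
    for _ in range(5):
        s = init()
        flag = False
        r = 0
        while not flag:
            # pick hit or stick uniformly at random
            a = get_action()
            s, r, flag = env.step(s, a)
        print("final state:", s, "reward:", r)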

2 Monte-Carlo Control in Easy21

The update rule is GLIE Monte-Carlo control with a time-varying step size: after each episode, for every (S_t, A_t) visited,

    N(S_t, A_t) <- N(S_t, A_t) + 1
    Q(S_t, A_t) <- Q(S_t, A_t) + (1 / N(S_t, A_t)) * (G_t - Q(S_t, A_t))

and the behaviour policy is epsilon-greedy with epsilon_t = N0 / (N0 + N(S_t)), N0 = 100.

from env import *
import numpy as np
import pickle

gamma = 1
N0 = 100
Q = get_dict()
N = get_dict()
# number of training episodes
K = 1000000

def mc(Q, N, N0=100, gamma=1):
    flag = False
    s = init()
    env = Easy21()
    # trajectory of (state, action, reward) triples
    trace = []
    while not flag:
        key = tuple(s)
        a = epsilon_greedy(Q, N, key, N0)
        s, r, flag = env.step(s, a)
        trace.append((key, a, r))
        # update the visit count
        N[key][a] += 1

    # backward pass: accumulate the return and update Q with step size 1 / N(s, a)
    G = 0
    n = len(trace)
    for i in range(n - 1, -1, -1):
        s, a, r = trace[i]
        G = gamma * G + r
        alpha = 1 / N[s][a]
        Q[s][a] += alpha * (G - Q[s][a])
        
for i in range(K):
    mc(Q, N)
    
# save the learned action-value function
with open("mcQ.pickle", "wb") as file:
    pickle.dump(Q, file)
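
As an optional check (a sketch added here, not part of the assignment), the hypothetical helper evaluate_greedy below plays episodes with the greedy policy derived from the learned Q and reports the average reward per episode.

def evaluate_greedy(Q, episodes=10000):
    # play `episodes` games greedily with respect to Q and return the mean reward
    env = Easy21()
    total = 0
    for _ in range(episodes):
        s = init()
        flag = False
        r = 0
        while not flag:
            a = np.argmax(Q[tuple(s)])
            s, r, flag = env.step(s, a)
        total += r
    return total / episodes

print("average reward of the greedy policy:", evaluate_greedy(Q))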

Plot of the results:

3 TD Learning in Easy21

The algorithm is backward-view Sarsa(lambda). At each step the implementation below computes

    delta = R + gamma * Q(S', A') - Q(S, A)    (delta = R - Q(S, A) on a terminal step)
    E(S, A) <- E(S, A) + 1

and then updates every state-action pair with alpha = 1 / N(S, A):

    Q(s, a) <- Q(s, a) + alpha * delta * E(s, a)
    E(s, a) <- gamma * lambda * E(s, a)

import numpy as np
import pickle
import matplotlib.pyplot as plt
from env import *

def td(Q, N, lambda_, N0=100, gamma=1):
    flag = False
    s = init()
    key = tuple(s)
    a = epsilon_greedy(Q, N, key, N0)
    env = Easy21()
    # eligibility traces
    E = get_dict()
    while not flag:
        key = tuple(s)
        # take the action
        s, r, flag = env.step(s, a)
        # update the visit count and the eligibility trace
        N[key][a] += 1
        E[key][a] += 1
        # TD error; the bootstrap term is added only for non-terminal transitions
        delta = r - Q[key][a]
        a1 = None
        if not flag:
            # choose the next action from the successor state
            key1 = tuple(s)
            a1 = epsilon_greedy(Q, N, key1, N0)
            delta += gamma * Q[key1][a1]
        # update every state-action pair in proportion to its eligibility
        alpha = 1 / N[key][a]
        for k in Q:
            Q[k] += alpha * delta * E[k]
            E[k] *= gamma * lambda_

        a = a1
        
# load the Monte-Carlo estimate to serve as the reference Q*
with open("mcQ.pickle", "rb") as file:
    Q1 = pickle.load(file)

Lambda = np.linspace(0, 1, 11)
N0 = 100
K = 10000
Error = []

for lambda_ in Lambda:
    Q = get_dict()
    N = get_dict()
    for i in range(K):
        td(Q, N, lambda_)
    error = error_square(Q, Q1)
    Error.append(error)
    if lambda_ == 0 or lambda_ == 1:
        filename = "td" + str(int(lambda_)) + ".pickle"
        with open(filename, "wb") as file:
            pickle.dump(Q, file)
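
The assignment also asks how the mean-squared error against the Monte-Carlo estimate varies with lambda; a minimal sketch that plots the Error list collected above (matplotlib is already imported in this script):

# plot squared error against lambda
plt.plot(Lambda, Error, marker="o")
plt.xlabel("lambda")
plt.ylabel("squared error vs Monte-Carlo Q")
plt.title("Sarsa(lambda) in Easy21")
plt.show()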

4 Linear Function Approximation in Easy21

Using the formulas from Lecture 6 (value function approximation), we only need to modify the code from the previous part:

  • Backward view
import numpy as np
import pickle
import matplotlib.pyplot as plt
from env import *

def linear_td(theta, lambda_, epsilon=0.05, gamma=1, alpha=0.01):
    # initialise the eligibility trace over the feature weights
    E = np.zeros_like(theta)
    flag = False
    s = init()
    key = tuple(s)
    a = greedy(theta, key, epsilon)
    env = Easy21()
    while not flag:
        key = tuple(s)
        # take the action
        s, r, flag = env.step(s, a)
        # feature vector of the current state-action pair
        f = get_feature(key, a)
        # TD error under the linear approximation Q(s, a) = theta . phi(s, a)
        delta = r - np.dot(theta, f)
        a1 = None
        if not flag:
            # choose the next action from the successor state
            key1 = tuple(s)
            a1 = greedy(theta, key1, epsilon)
            f1 = get_feature(key1, a1)
            delta += gamma * np.dot(theta, f1)
        # accumulate the trace and update the weights
        E = gamma * lambda_ * E + f
        theta += alpha * delta * E
        a = a1
        
def computeQ(theta):
    # expand the linear approximation back into a tabular Q for comparison and plotting
    Q = dict()
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    A = [0, 1]
    for x in X:
        for y in Y:
            s = (x, y)
            Q[s] = np.array([0.0, 0.0])
            for a in A:
                f = get_feature(s, a)
                Q[s][a] = np.dot(theta, f)
    return Q

with open("mcQ.pickle", "rb") as file:
    Q1 = pickle.load(file)
    
Error = []
Lambda = np.linspace(0, 1, 11)
K = 10000

for lambda_ in Lambda:
    theta = np.zeros(36)  # 36 features: 3 dealer intervals x 6 player intervals x 2 actions
    for i in range(K):
        linear_td(theta, lambda_)
    Q = computeQ(theta)
    error = error_square(Q, Q1)
    Error.append(error)
    if lambda_ == 0 or lambda_ == 1:
        filename = "linear_td" + str(int(lambda_)) + ".pickle"
        with open(filename, "wb") as file:
            pickle.dump(Q, file)
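
To make the coarse coding concrete, here is a small illustrative sketch (not part of the original code) that prints which of the 36 binary features are active for one sample state; the intervals overlap, so a state can fall into several cuboids. The example state (3, 7) and the index arithmetic are only for illustration and rely on the loop order inside get_feature.

# inspect the active features for dealer showing 3, player sum 7, action hit
s_example = (3, 7)
f = get_feature(s_example, 0)
print("number of active features:", f.sum())
for idx in np.nonzero(f)[0]:
    # features are laid out as dealer interval -> player interval -> action
    d_idx, rest = divmod(idx, len(player) * len(action))
    p_idx, a_idx = divmod(rest, len(action))
    print("dealer interval", dealer[d_idx], "player interval", player[p_idx], "action", action[a_idx])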

Appendix

Plotting function

import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

def plot(filename, title):
    with open(filename, "rb") as file:
        Q = pickle.load(file)
        
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    Z = np.zeros((21, 10))
    
    # value surface V(s) = max_a Q(s, a)
    for key in Q:
        x = key[0]
        y = key[1]
        Z[y - 1][x - 1] = np.max(Q[key])

    X, Y = np.meshgrid(X, Y)
    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    ax.plot_surface(X, Y, Z, cmap=cm.coolwarm)
    ax.set_xlabel('Dealer showing')
    ax.set_ylabel('Player sum')
    ax.set_zlabel('Value')
    plt.title(title)
    plt.show()
    
mc = "mcQ.pickle"
td0 = "td0.pickle"
td1 = "td1.pickle"
linear_td0 = "linear_td0.pickle"
linear_td1 = "linear_td1.pickle"

plot(mc, "mc")
plot(td0, "td0")
plot(td1, "td1")
plot(linear_td0, "linear_td0")
plot(linear_td1, "linear_td1")