Understanding the Policy Gradient derivation



I am trying to recreate a very simple Policy Gradient example from the original source, Andrej Karpathy's blog. In that article you will find an example using CartPole and a Policy Gradient with a list of weights and a Softmax activation. Here is my recreation of that very simple CartPole Policy Gradient example, and it works great.

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import copy

NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    def __init__(self):
        self.poly = PolynomialFeatures(1)
        self.w = np.random.rand(5, 2)

    def policy(self, state):
        z = state.dot(self.w)
        exp = np.exp(z)
        return exp/np.sum(exp)

    def __softmax_grad(self, softmax):
        s = softmax.reshape(-1,1)
        return np.diagflat(s) - np.dot(s, s.T)

    def grad(self, probs, action, state):
        dsoftmax = self.__softmax_grad(probs)[action,:]
        dlog = dsoftmax / probs[0,action]
        grad = state.T.dot(dlog[None,:])
        return grad

    def update_with(self, grads, rewards):

        for i in range(len(grads)):
            # Loop through everything that happened in the episode
            # and update towards the log policy gradient times **FUTURE** reward

            total_grad_effect = 0
            for t, r in enumerate(rewards[i:]):
                total_grad_effect += r * (GAMMA ** r)
            self.w += LEARNING_RATE * grads[i] * total_grad_effect
            print("Grads update: " + str(np.sum(grads[i])))



def main(argv):
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for e in range(NUM_EPISODES):
        state = env.reset()[None, :]
        state = agent.poly.fit_transform(state)

        rewards = []
        grads = []
        score = 0

        while True:

            probs = agent.policy(state)
            action_space = env.action_space.n
            action = np.random.choice(action_space, p=probs[0])

            next_state, reward, done,_ = env.step(action)
            next_state = next_state[None,:]
            next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
            grad = agent.grad(probs, action, state)

            grads.append(grad)
            rewards.append(reward)

            score += reward
            state = next_state

            if done:
                break

        agent.update_with(grads, rewards)
        complete_scores.append(score)

    env.close()
    plt.plot(np.arange(NUM_EPISODES),
             complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

(Figure: training scores per episode for the softmax version.)

I am trying to do almost the same example, but with a Sigmoid activation (purely for simplicity). That is all I want to do: switch the activation in the model from softmax to sigmoid, which should work just fine (based on the explanation below). But my Policy Gradient model does not learn anything and stays random. Any suggestions?

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    def __init__(self):
        self.poly = PolynomialFeatures(1)
        self.w = np.random.rand(5, 1) - 0.5

    # Our policy that maps state to action parameterized by w
    # noinspection PyShadowingNames
    def policy(self, state):
        z = np.sum(state.dot(self.w))
        return self.sigmoid(z)

    def sigmoid(self, x):
        s = 1 / (1 + np.exp(-x))
        return s

    def sigmoid_grad(self, sig_x):
        return sig_x * (1 - sig_x)

    def grad(self, probs, action, state):
        dsoftmax = self.sigmoid_grad(probs)
        dlog = dsoftmax / probs
        grad = state.T.dot(dlog)
        grad = grad.reshape(5, 1)
        return grad

    def update_with(self, grads, rewards):
        if len(grads) < 50:
            return
        for i in range(len(grads)):
            # Loop through everything that happened in the episode
            # and update towards the log policy gradient times **FUTURE** reward

            total_grad_effect = 0
            for t, r in enumerate(rewards[i:]):
                total_grad_effect += r * (GAMMA ** r)
            self.w += LEARNING_RATE * grads[i] * total_grad_effect


def main(argv):
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for e in range(NUM_EPISODES):
        state = env.reset()[None, :]
        state = agent.poly.fit_transform(state)

        rewards = []
        grads = []
        score = 0

        while True:

            probs = agent.policy(state)
            action_space = env.action_space.n
            action = np.random.choice(action_space, p=[1 - probs, probs])

            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            next_state = agent.poly.fit_transform(next_state.reshape(1, 4))

            grad = agent.grad(probs, action, state)
            grads.append(grad)
            rewards.append(reward)

            score += reward
            state = next_state

            if done:
                break

        agent.update_with(grads, rewards)
        complete_scores.append(score)

    env.close()
    plt.plot(np.arange(NUM_EPISODES),
             complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

Plotting everything it learns shows that it stays random. Tuning the hyperparameters did not help at all. A sample image is below.

(Figure: training scores per episode for the sigmoid version; the scores stay random.)

References

1) Deep Reinforcement Learning: Pong from Pixels

2) An Introduction to Policy Gradients with CartPole and Doom

3) Deriving Policy Gradients and Implementing REINFORCE

4) Machine Learning Trick of the Day (5): Log Derivative Trick


UPDATE

Judging from the graph, the answer below seems to do some work. But it is not the log probability, and not even the gradient of the policy, which changes the whole purpose of the RL Policy Gradient. Please check the references above. After the image, we state the following.

(Image: the policy-gradient update formula.)

I need to take the gradient of the Log of my Policy function (which, simply put, is just the weights and a sigmoid activation).
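
For reference (not part of the original post), a minimal hand derivation of that log-policy gradient, assuming the policy is π(a=1|s) = σ(s·w) exactly as in the sigmoid code above:

\begin{aligned}
\nabla_w \log \pi(a{=}1 \mid s) &= \nabla_w \log \sigma(s \cdot w) = \bigl(1 - \sigma(s \cdot w)\bigr)\, s^\top, \\
\nabla_w \log \pi(a{=}0 \mid s) &= \nabla_w \log \bigl(1 - \sigma(s \cdot w)\bigr) = -\,\sigma(s \cdot w)\, s^\top, \\
\text{so in both cases:}\qquad \nabla_w \log \pi(a \mid s) &= \bigl(a - \sigma(s \cdot w)\bigr)\, s^\top .
\end{aligned}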


I suggest you post this question on Data Science Stack Exchange, since it is mostly a theoretical question (Stack Overflow is mainly for coding issues). You will also reach more people who are knowledgeable in this field.
Gilles-Philippe Paillé

@Gilles-PhilippePaillé I added the code that represents the problem. All I need to do is fix certain parts related to the activation. Please check the update above.
GensaGames

For deriving the Policy Gradient, here is a reference article with a working example of the same kind of setup; hopefully it explains things in more detail: medium.com/@thechrisyoon/…
Muhammad Usman

@MuhammadUsman Thanks for the information. I had already gone through that source. Right now the example above is clear, and I am trying to change the activation from softmax to sigmoid. That is the only thing I want to do in the example above.
GensaGames

@JasonChia The sigmoid outputs a real number in the range [0, 1], which can be interpreted as the probability of the positive action (e.g. moving right in CartPole). The probability of the negative action (moving left) is then 1 - sigmoid. These probabilities sum to 1. And yes, this is the standard CartPole environment.
Pavel Tyshevskyi

Answers:



The problem is with the grad method.

def grad(self, probs, action, state):
    dsoftmax = self.sigmoid_grad(probs)
    dlog = dsoftmax / probs
    grad = state.T.dot(dlog)
    grad = grad.reshape(5, 1)
    return grad

In the original code, Softmax was used together with the CrossEntropy loss function. When you switch the activation to Sigmoid, the proper loss function becomes Binary CrossEntropy. Now, the purpose of the grad method is to compute the gradient of the loss function with respect to the weights. Sparing the details, the proper gradient is given by the term (probs - action) * state. The last thing is to add a minus sign: we want to maximize the negative of the loss function.

So the correct grad method is:

def grad(self, probs, action, state):
    grad = state.T.dot(probs - action)
    return -grad
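
Not part of the original answer, but as a quick sanity check (a sketch with made-up state values, no environment involved), the (action - probs) * state term returned above can be compared against a finite-difference gradient of log pi(action | state) for this sigmoid policy; the two agree, which also connects it to the log-policy gradient requested in the question's update:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def log_policy(w, state, action):
    # log pi(action | state) for the Bernoulli policy pi(a=1|s) = sigmoid(state . w)
    p = sigmoid(np.sum(state.dot(w)))
    return np.log(p) if action == 1 else np.log(1.0 - p)

def analytic_grad(w, state, action):
    # the (action - probs) * state term, i.e. -(probs - action) * state from the answer
    p = sigmoid(np.sum(state.dot(w)))
    return ((action - p) * state.T).reshape(w.shape)

def numeric_grad(w, state, action, eps=1e-6):
    # central finite differences of log pi with respect to each weight
    g = np.zeros_like(w)
    for i in range(w.shape[0]):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[i, 0] += eps
        w_minus[i, 0] -= eps
        g[i, 0] = (log_policy(w_plus, state, action) - log_policy(w_minus, state, action)) / (2 * eps)
    return g

np.random.seed(0)
w = np.random.randn(5, 1) * 0.01     # same shape as the agent's weights
state = np.random.randn(1, 5)        # stand-in for a polynomial-expanded CartPole state
for action in (0, 1):
    diff = np.max(np.abs(analytic_grad(w, state, action) - numeric_grad(w, state, action)))
    print(action, diff)              # both differences should be tiny (around 1e-9 or smaller)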

Another change you may want to make is to increase the learning rate. LEARNING_RATE = 0.0001 and NUM_EPISODES = 5000 produce the following plot:

(Figure: mean reward vs. number of episodes.)

Convergence will be even faster if the weights are initialized from a Gaussian distribution with zero mean and small variance:

def __init__(self):
    self.poly = PolynomialFeatures(1)
    self.w = np.random.randn(5, 1) * 0.01

(Figure: mean reward vs. number of episodes with the Gaussian initialization.)

UPDATE

Added the complete code to reproduce the results:

import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

NUM_EPISODES = 5000
LEARNING_RATE = 0.0001
GAMMA = 0.99


# noinspection PyMethodMayBeStatic
class Agent:
    def __init__(self):
        self.poly = PolynomialFeatures(1)
        self.w = np.random.randn(5, 1) * 0.01

    # Our policy that maps state to action parameterized by w
    # noinspection PyShadowingNames
    def policy(self, state):
        z = np.sum(state.dot(self.w))
        return self.sigmoid(z)

    def sigmoid(self, x):
        s = 1 / (1 + np.exp(-x))
        return s

    def sigmoid_grad(self, sig_x):
        return sig_x * (1 - sig_x)

    def grad(self, probs, action, state):
        grad = state.T.dot(probs - action)
        return -grad

    def update_with(self, grads, rewards):
        if len(grads) < 50:
            return
        for i in range(len(grads)):
            # Loop through everything that happened in the episode
            # and update towards the log policy gradient times **FUTURE** reward

            total_grad_effect = 0
            for t, r in enumerate(rewards[i:]):
                total_grad_effect += r * (GAMMA ** r)
            self.w += LEARNING_RATE * grads[i] * total_grad_effect


def main(argv):
    env = gym.make('CartPole-v0')
    np.random.seed(1)

    agent = Agent()
    complete_scores = []

    for e in range(NUM_EPISODES):
        state = env.reset()[None, :]
        state = agent.poly.fit_transform(state)

        rewards = []
        grads = []
        score = 0

        while True:

            probs = agent.policy(state)
            action_space = env.action_space.n
            action = np.random.choice(action_space, p=[1 - probs, probs])

            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            next_state = agent.poly.fit_transform(next_state.reshape(1, 4))

            grad = agent.grad(probs, action, state)
            grads.append(grad)
            rewards.append(reward)

            score += reward
            state = next_state

            if done:
                break

        agent.update_with(grads, rewards)
        complete_scores.append(score)

    env.close()
    plt.plot(np.arange(NUM_EPISODES),
             complete_scores)
    plt.savefig('image1.png')


if __name__ == '__main__':
    main(None)

Thank you very much. I will try this solution later.
GensaGames

I am not sure you did this derivation for my function. As you can check in the image above, I need to take the gradient of the log of the Policy. In my case, the Policy is just the weights with a sigmoid activation. But the gradient in your answer has nothing to do with my gradient. Right?
GensaGames

Note that you did not include any information about which action was taken. According to this lecture on Policy Gradient (slide 13), the update should look like (action - probs) * sigmoid_grad(probs), but I omitted sigmoid_grad because of the vanishing sigmoid gradient problem.
Pavel Tyshevskyi

The key here is to indicate the direction in which we want to change the weights. If action = 1, we want probs to get closer to 1, i.e. increase the weights (positive gradient). If action = 0, we want probs to get closer to 0, i.e. decrease the weights (negative gradient).
Pavel Tyshevskyi

Anyway, the change above does not work at all on its own. Could you share the whole file? In the meantime, I want to keep the example clear, and I do not care about the vanishing gradient issue in this case. And (action - probs) is just another way of making the same kind of change.
GensaGames