游戏中的Q-learning无法按预期工作

Jac*_*don 10 artificial-intelligence game-ai q-learning

我试图在我写的一个简单的游戏中实现Q-learning.该游戏基于玩家必须"跳跃"以避免迎面而来的盒子.

我设计了两个动作系统; jump并且do_nothing状态是距下一个区块的距离(划分和覆盖以确保没有大量状态).

我的问题似乎是我的算法实现没有考虑"未来的奖励",所以它最终在错误的时间跳跃.

这是我对Q学习算法的实现;

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];

            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];

        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // We can't jump while in mid-air
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};
Run Code Online (Sandbox Code Playgroud)

以下是它使用的一些属性:

epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0
Run Code Online (Sandbox Code Playgroud)

我不得不使用lastAction/lastDistance来计算Q,因为我不能使用当前数据(将作用于之前在帧中执行的操作).

think完成所有渲染和游戏内容(物理,控制,死亡等)后,每帧调用一次该方法.

var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    Object.defineProperties(this, {
        epsilon: {
            value: 0.05
        },

        alpha: {
            value: 1
        },

        gamma: {
            value: 1
        },

        resolution: {
            value: 0.1
        },

        actions: {
            value: [ 'jump', 'do_nothing' ]
        },

        Q: {
            value: { },
            writable: true
        },

        liveReward: {
            value: 0
        },

        scoreReward: {
            value: 100
        },

        deathReward: {
            value: -1000
        },

        lastAction: {
            value: 'do_nothing',
            writable: true
        },

        lastDistance: {
            value: 0,
            writable: true
        },

        lastScore: {
            value: 0,
            writable: true
        }
    });
};

JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];

            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];

        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};

JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    this.context.save();

    this.context.textAlign = 'center';
    this.context.textBaseline = 'bottom';

    this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.textBaseline = 'top';

    this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.restore();
};

JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);

    this.think();
}

Game.JumpGameAI = JumpGameAIClass;
Run Code Online (Sandbox Code Playgroud)
body {
    background-color: #EEEEEE;
    text-align: center;
}

canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}
Run Code Online (Sandbox Code Playgroud)
<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
  
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
  
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
  
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
Run Code Online (Sandbox Code Playgroud)

Mar*_*gus 5

你基本上有简化版:

在此输入图像描述

资料来源:Flappy Bird RL

我用过值:

    epsilon: {
        value: 0.01
    },
    alpha: {
        value: 0.7
    },
    gamma: {
        value: 0.9
    },
    resolution: {
        value: 0.1
    },  
    liveReward: {
        value: 10
    },
    scoreReward: {
        value: -100
    },
    deathReward: {
        value: 1000
    },
Run Code Online (Sandbox Code Playgroud)

在前20次尝试中没有超过100的麻烦.


可以用时间逻辑来描述Q学习

Q(s, a)=r(s,a)+gamma*max_a'(Q(s', a'))
Run Code Online (Sandbox Code Playgroud)

哪里

  • r(s,a)= r=立即奖励
  • gamma =延迟与直接奖励的相对值(0到1)
  • s' =行动后的新州 a
  • a =国家行动 s
  • a' =国家行动 s'

你应该执行它

选择一个动作并执行它

  1. 对于每个状态 - 动作对(s,a),将表条目Q(s,a)初始化为零
  2. 观察当前状态
  3. 永远做:
    • 选择一个动作一个并执行它
    • 立即获得奖励r又称Q(s,a)
    • 观察新州s'
    • 更新Q(s,a)= r(s,a)+ gamma*max_a'(Q(s',a'))的表条目
    • S = S'