diff --git a/week7/pomdp_seminar_kung_fu.ipynb b/week7/pomdp_seminar_kung_fu.ipynb
index 3e1ebac6..e3d353c9 100644
--- a/week7/pomdp_seminar_kung_fu.ipynb
+++ b/week7/pomdp_seminar_kung_fu.ipynb
@@ -4,7 +4,9 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
@@ -31,9 +33,14 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
-    "If you have not yet done below commands, you had better do it now."
+    "If you are new to this course and want more instructions on how to set up the environment and all the libs (docker / windows / gpu / blas / etc.), you can read the [vital instructions here](https://github.com/yandexdataschool/Practical_RL/issues/1#issue-202648393). \n",
+    "\n",
+    "Please make sure that you have bleeding-edge versions of Theano, Lasagne and Agentnet. A quick and dirty way to ensure this is shown below. "
    ]
   },
   {
@@ -153,7 +160,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Downsample image, and crop it, showing only the most useful part of image. "
    ]
@@ -176,7 +186,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "Function for tracking performance while training "
    ]
@@ -346,7 +359,10 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "All hyperparameters (except number of layers and neurons) are declared here as upper case letters along with global varaibles."
    ]
@@ -365,21 +381,24 @@
     "OBS_SHAPE = env.observation_space.shape \n",
     "OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH = OBS_SHAPE\n",
     "\n",
-    "N_SIMULTANEOUS_GAMES = 2 # this is also known as number of agents in exp_replay_pool\n",
+    "# These 4 constants were shown to lead to nearly state-of-the-art results on the Kung-Fu Master game\n",
+    "N_SIMULTANEOUS_GAMES = 10 # this is also known as number of agents in exp_replay_pool\n",
     "MAX_POOL_SIZE = 1000\n",
-    "REPLAY_SIZE = 100\n",
-    "SEQ_LENGTH = 15 \n",
-    "\n",
+    "SEQ_LENGTH = 25\n",
     "N_POOL_UPDATES = 1\n",
+    "\n",
     "EVAL_EVERY_N_ITER = 10 \n",
-    "N_EVAL_GAMES = 1\n",
+    "N_EVAL_GAMES = 2\n",
     "\n",
     "N_FRAMES_IN_BUFFER = 4 # number of consequent frames to feed in CNN"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
    "source": [
     "# A2C with memory"
    ]
@@ -413,80 +432,88 @@
     "# TYPE YOUR CODE HERE\n",
     "# provide the main body of the network : first three convolutional layers and dense one on top \n",
     "# you may want to change nonlinearity - feel free to do this \n",
-    "# note that we have changed filter size here because of reduced image width and height compared to those in papers"
+    "# note that we have changed filter size here because of reduced image width and height compared to those in papers\n",
+    "conv1 = Conv2DLayer(wnd_reshape, ...)\n",
+    "...\n",
+    "dense = DenseLayer(...)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 77,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
     "# YOUR CODE HERE\n",
     "# define 256 neuron LSTM cell:\n",
-    "# - define two input layers each of n_lstm_cells neurons\n",
-    "# - Feed into `LSTMcell` this two layers and input layer as additional third param"
+    "# - define two input layers, each with n_lstm_cells neurons (maybe 256 is a good baseline) \n",
+    "# - feed these two layers into `LSTMcell`, and \n",
+    "#   the input layer (the last `Dense` in case of A2C+LSTM) as an additional third parameter"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": null,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
    "source": [
-    "neck_layer = concat([, new_out]) # network neck \n",
-    "\n",
-    "# YOUR CODE HERE \n",
-    "# define actors head as \n",
-    "# - logits_layer – dense(neck) with nonlinearity=None \n",
-    "# - policy layer – softmax over logits_layer\n",
-    "\n",
-    "\n",
-    "action_layer = ProbabilisticResolver(policy_layer) "
+    "neck_layer = concat([, ]) # network neck "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 78,
    "metadata": {
-    "collapsed": false
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
-    "# critic head\n",
-    "V_layer = DenseLayer(neck_layer, 1, nonlinearity=None)"
+    "# YOUR CODE HERE \n",
+    "# define the actor's head as \n",
+    "# - logits_layer – dense(neck) with nonlinearity=None \n",
+    "# - policy_layer – softmax over logits_layer\n",
+    "........\n",
+    "action_layer = ProbabilisticResolver(policy_layer) "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 79,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
-    "targetnet = TargetNetwork(V_layer)\n",
-    "V_target = targetnet.output_layers"
+    "# critic head\n",
+    "V_layer = DenseLayer(neck_layer, 1, nonlinearity=None)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
     "# YOUR CODE HERE\n",
-    "# define observation_layer\n",
-    "# policy_estimators should include 1) logits_layer and 2) V_target \n",
-    "# agent_states is a dictionary of {new_value: old_value}\n",
-    "# pairs of this kind shoul be present for prev_wnd, prev_cell, prev_out\n",
-    "# action_layer is action_layer, as usual : ) \n",
+    "# `observation_layer` is the input layer of the NN, as usual\n",
+    "# `policy_estimators` should include 1) logits_layer and 2) V_layer \n",
+    "# `agent_states` is a dictionary of {new_value: old_value}. You should update \n",
+    "#   a) the previous window (input buffer, prev_wnd), b) the previous LSTM cell state, c) the LSTM cell output \n",
+    "# `action_layer` is action_layer, as usual : ) \n",
     "agent = Agent(....)"
    ]
   },
@@ -494,11 +521,14 @@
    "cell_type": "code",
    "execution_count": 82,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
-    "pool = EnvPool(agent, make_env, n_games=N_SIMULTANEOUS_GAMES, max_size=MAX_POOL_SIZE) # may need to adjust\n",
+    "# may need to adjust (increasing N_SIMULTANEOUS_GAMES is usually a good idea)\n",
+    "pool = EnvPool(agent, make_env, n_games=N_SIMULTANEOUS_GAMES, max_size=MAX_POOL_SIZE) \n",
     "replay = pool.experience_replay"
    ]
   },
@@ -506,11 +536,13 @@
    "cell_type": "code",
    "execution_count": 83,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
-    "_, _, _, action_seq, (logits_seq, V_seq, target_V_seq) = agent.get_sessions( \n",
+    "_, _, _, action_seq, (logits_seq, V_seq) = agent.get_sessions( \n",
     "    replay, \n",
     "    session_length=SEQ_LENGTH, \n",
     "    experience_replay=True\n",
@@ -521,7 +553,9 @@
    "cell_type": "code",
    "execution_count": 84,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
@@ -529,7 +563,7 @@
     "# we can't guarantee that theano optimizes logsoftmax automatically since it's still in dev \n",
     "# for more info see (https://github.com/Theano/Theano/issues/2944 of 2015 year)\n",
     "\n",
-    "# logits_seq.shape = (batch_size, SEQ_LENGTH, N_ACTIONS)\n",
+    "# logits_seq.shape is (batch_size, SEQ_LENGTH, N_ACTIONS)\n",
     "logits_flat = logits_seq.reshape([-1, N_ACTIONS])\n",
     "policy_seq = T.nnet.softmax(logits_flat).reshape(logits_seq.shape)\n",
     "logpolicy_seq = T.nnet.logsoftmax(logits_flat).reshape(logits_seq.shape)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 92,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
@@ -549,7 +585,6 @@
     "    policy=logpolicy_seq,\n",
     "    treat_policy_as_logpolicy=True,\n",
     "    state_values=V_seq[:,:,0],\n",
-    "    state_values_target=target_V_seq[:,:,0],\n",
     "    actions=replay.actions[0],\n",
     "    rewards=replay.rewards, \n",
     "    is_alive=replay.is_alive,\n",
@@ -572,7 +607,9 @@
    "cell_type": "code",
    "execution_count": 93,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
    },
    "outputs": [],
    "source": [
@@ -610,22 +647,6 @@
     "loss, eval_rewards = 0, []"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 95,
-   "metadata": {
-    "collapsed": true,
-    "deletable": true,
-    "editable": true
-   },
-   "outputs": [],
-   "source": [
-    "# import os\n",
-    "# import sys\n",
-    "# # stder_old = sys.stderr\n",
-    "# sys.stderr = open(os.devnull, 'w')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 32,
@@ -651,6 +672,26 @@
     "untrained_reward"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "# If you are annoyed by the stderr messages that pool.evaluate() prints \n",
+    "# and that pollute the jupyter cell output, you can do one of the following:\n",
+    "# 1. use warnings.filterwarnings(\"ignore\")\n",
+    "# 2. use cell magic %%capture\n",
+    "# 3. simply redirect stderr to /dev/null as follows:\n",
+    "# import os, sys\n",
+    "# stder_old = sys.stderr\n",
+    "# sys.stderr = open(os.devnull, 'w') "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 38,
@@ -675,7 +716,7 @@
     "    for _ in range(N_POOL_UPDATES): pool.update(SEQ_LENGTH, append=True) \n",
     "    train_starts = timer()\n",
     "    \n",
-    "    # YOUR CODE HERE : train network and update target network\n",
+    "    # YOUR CODE HERE : train network (actor and critic)\n",
     "    raise NotImplementedError\n",
     "    \n",
     "    th_times.append(timer() - train_starts)\n",
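For readers working through the exercise cells this diff touches, here is a minimal sketch of the network body, LSTM cell and neck that the "TYPE YOUR CODE HERE" hints describe. It assumes the Lasagne layers used elsewhere in the notebook (`Conv2DLayer`, `DenseLayer`, `concat`) and AgentNet's `LSTMCell` as in the other course seminars; the input shape, filter sizes and the stand-in `wnd_reshape` layer are illustrative choices, not values prescribed by the diff.

```python
from lasagne.layers import InputLayer, Conv2DLayer, DenseLayer, concat
from lasagne.nonlinearities import rectify
from agentnet.memory import LSTMCell  # assumed import path, as in other course seminars

# stand-in for the notebook's window of N_FRAMES_IN_BUFFER stacked frames;
# in the seminar this layer is built from the observation buffer, not created from scratch
wnd_reshape = InputLayer((None, 4, 42, 42), name="frame window")

# three small convolutions and a dense layer on top; filter sizes are guesses
# for the downsampled, cropped Kung-Fu Master frames
conv1 = Conv2DLayer(wnd_reshape, num_filters=32, filter_size=5, stride=2, nonlinearity=rectify)
conv2 = Conv2DLayer(conv1, num_filters=32, filter_size=5, stride=2, nonlinearity=rectify)
conv3 = Conv2DLayer(conv2, num_filters=32, filter_size=3, stride=2, nonlinearity=rectify)
dense = DenseLayer(conv3, num_units=256, nonlinearity=rectify)

# 256-unit LSTM: two input layers hold the previous cell state and output,
# and LSTMCell produces their new values from the dense features
n_lstm_cells = 256
prev_cell = InputLayer((None, n_lstm_cells), name="prev lstm cell")
prev_out = InputLayer((None, n_lstm_cells), name="prev lstm out")
new_cell, new_out = LSTMCell(prev_cell, prev_out, input_or_inputs=dense)

# network neck: feedforward features concatenated with the recurrent output
neck_layer = concat([dense, new_out])
```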
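The actor head and the `Agent` wiring asked for further down could then look like the sketch below. It continues from the previous snippet and refers to the notebook's own `observation_layer` and window-buffer layers (called `new_wnd` / `prev_wnd` here); the `Agent` keyword names follow how AgentNet is used in the other course seminars and the comments in the diff, so adjust them if your AgentNet version differs.

```python
from lasagne.layers import DenseLayer, NonlinearityLayer
from lasagne.nonlinearities import softmax
from agentnet.agent import Agent                     # assumed import paths
from agentnet.resolver import ProbabilisticResolver

# actor head: raw logits, a softmax policy on top, and a sampling resolver
# (N_ACTIONS is defined earlier in the notebook)
logits_layer = DenseLayer(neck_layer, N_ACTIONS, nonlinearity=None)
policy_layer = NonlinearityLayer(logits_layer, softmax)
action_layer = ProbabilisticResolver(policy_layer)

# critic head, already given in the notebook
V_layer = DenseLayer(neck_layer, 1, nonlinearity=None)

# agent_states maps each "new" layer to the layer holding its previous value, so the
# frame window and both LSTM states are carried over between environment steps
agent = Agent(
    observation_layers=observation_layer,
    policy_estimators=(logits_layer, V_layer),
    agent_states={new_wnd: prev_wnd, new_cell: prev_cell, new_out: prev_out},
    action_layers=action_layer,
)
```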
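Finally, the loop's "# YOUR CODE HERE : train network (actor and critic)" line usually reduces to one compiled Theano update step. A sketch, assuming `loss` has already been collapsed into a scalar Theano expression from the A2C objective above (e.g. the mean elementwise loss plus any entropy or critic terms you add), and that the replay pool lives in shared variables so the compiled function needs no inputs:

```python
import theano
import lasagne

# all trainable weights of actor and critic (they share the conv body and the neck)
weights = lasagne.layers.get_all_params([V_layer, action_layer], trainable=True)

# one Adam step on the scalar loss; the learning rate is just a reasonable default
updates = lasagne.updates.adam(loss, weights, learning_rate=1e-4)
train_step = theano.function([], loss, updates=updates)

# inside the training loop, in place of the "YOUR CODE HERE" line:
loss = train_step()
```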