I have been learning Reinforcement Learning for about two weeks. Although I haven't gone through all of Arthur Juliani's course yet, I have now been able to write a small Q-learning example.
This example uses a DNN in place of a Q-value table to solve a path-finding problem. Actually, the graph of paths looks more like a tree:




The start point is ‘0’, and the destination (or ‘goal’) is ’12’.
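Since the tree figure may not come through here, the same node graph can be drawn from the edge list used later in the code. This is only an illustrative sketch and assumes the networkx and matplotlib packages are installed; neither is needed for the Q-learning example itself.

import networkx as nx
import matplotlib.pyplot as plt

points_list = [(0,1), (0,2),
        (1,3), (1,4), (2,5), (2,6),
        (3,7), (3,8), (4,9), (4,10),
        (5,11), (5,12), (6,13), (6,14)]

# Draw the undirected node graph: node 0 is the root, node 12 is the goal
G = nx.Graph()
G.add_edges_from(points_list)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
plt.show()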
The code framework of my example mainly follows Manuel Amunategui's tutorial, but replaces the Q-value table with a one-layer neural network.
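For contrast, the tabular version that the network stands in for updates a Q matrix directly with the Bellman rule. The sketch below is not taken from the tutorial; it simply reuses R, gamma, and MATRIX_SIZE as they are defined in the full listing that follows.

import numpy as np

# Sketch only: plain Q-table learning with the same reward matrix R and discount gamma
Q = np.zeros((MATRIX_SIZE, MATRIX_SIZE))
for episode in range(500):
    state = np.random.randint(0, MATRIX_SIZE)
    # Valid moves are those whose reward is not the -1000 penalty
    valid = np.where(np.array(R[state]).flatten() >= 0)[0]
    action = np.random.choice(valid)
    # Bellman update: immediate reward plus discounted best value of the next node
    Q[state, action] = R[state, action] + gamma * np.max(Q[action])

The full neural-network version is the listing below.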

import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import pylab as plt
MATRIX_SIZE = 15
goal = 12
points_list = [(0,1), (0,2), \
        (1,3), (1,4), (2,5), (2,6), \
        (3,7), (3,8), (4,9), (4,10), \
        (5,11), (5,12), (6,13), (6,14)]
# Build a feed-forward network: one-hot encoded state as input, Q-values of all actions as output
state_in = tf.placeholder(tf.int32, [1])
state_oh = slim.one_hot_encoding(state_in, 15)
output = slim.fully_connected(state_oh, 15,
        biases_initializer = None, activation_fn = tf.nn.relu,
        weights_initializer = tf.ones_initializer())
outputQ = tf.reshape(output, [-1])
chosen_action = tf.argmax(outputQ, 0)
nextQ = tf.placeholder(tf.float32, [15])
loss = tf.reduce_sum(tf.square(nextQ - outputQ))
# The gradient descent optimizer usually has better generalization performance
optimizer = tf.train.GradientDescentOptimizer(0.1)
update = optimizer.minimize(loss)
# Build reward matrix
R = np.matrix(np.ones(shape=(MATRIX_SIZE, MATRIX_SIZE)))
# Set an extremely low (negative) reward for unconnected node pairs
R *= -1000
for point in points_list:
    if point[1] == goal:
        R[point] = 100
    else:
        R[point] = 0
    if point[0]== goal:
        R[point[::-1]] = 100
    else:
        R[point[::-1]] = 0
R[goal, goal] = 100
# Discount factor
gamma = 0.9
# Epsilon for epsilon-greedy exploration
e = 0.1
# Training
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    reward_list = []
    for j in range(50):
        all_reward = 0
        # Each iteration runs 10 one-step updates from random start states
        for i in range(10):
            current_state = np.random.randint(0, 15)
            # Predict Q-values for the current state and take the greedy action
            action, allQ = sess.run([chosen_action, outputQ],
                    feed_dict = {state_in: [current_state]})
            # With probability e, explore a random action instead
            if np.random.rand(1) < e:
                action = np.random.randint(0, 15)
            # In this setup an action is a node index, so it is also the next state
            new_state = action
            Q1 = sess.run(outputQ,
                    feed_dict = {state_in: [new_state]})
            maxQ1 = np.max(Q1)
            reward = R[current_state, action]
            # Bellman target: immediate reward plus discounted max Q of the next state
            targetQ = allQ
            targetQ[action] = reward + gamma * maxQ1
            # Train the network: push the Q-values of the current state toward the target
            _, step_loss = sess.run([update, loss],
                    feed_dict = {state_in: [current_state], nextQ: targetQ})
            all_reward += reward
        reward_list.append(all_reward)
    # Plot the total reward collected in each training iteration
    plt.plot(reward_list)
    plt.show()
    # Testing: greedily follow the learned Q-values from node 0 to the goal
    current_state = 0
    steps = [current_state]
    while current_state != goal:
        action = sess.run([chosen_action],
                feed_dict = {state_in: [current_state]})
        steps.append(action[0])
        current_state = action[0]
    print("Most efficient path:")
    print(steps)

The reward curve over the training iterations:




And this example finally reports:

Most efficient path:
[0, 2, 5, 12]

which is the correct answer.
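As a quick sanity check (not part of the original example), the same answer can be recovered with a plain breadth-first search over points_list, which returns the shortest path from node 0 to the goal:

from collections import deque

def shortest_path(edges, start, goal):
    # Build an undirected adjacency list from the edge list
    adj = {}
    for a, b in edges:
        adj.setdefault(a, []).append(b)
        adj.setdefault(b, []).append(a)
    queue = deque([[start]])
    seen = {start}
    while queue:
        path = queue.popleft()
        if path[-1] == goal:
            return path
        for nxt in adj.get(path[-1], []):
            if nxt not in seen:
                seen.add(nxt)
                queue.append(path + [nxt])
    return None

print(shortest_path(points_list, 0, goal))   # prints [0, 2, 5, 12]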