
var R = {}; // the Recurrent library

(function(global) {
"use strict";
// Utility fun
function assert(condition, message) {
// from http://stackoverflow.com/questions/15313418/javascript-assert
if (!condition) {
message = message || "Assertion failed";
if (typeof Error !== "undefined") {
throw new Error(message);
}
throw message; // Fallback
}
}
// Random numbers utils
var return_v = false;
var v_val = 0.0;
var gaussRandom = function() {
if (return_v) {
return_v = false;
return v_val;
}
var u = 2 * Math.random() - 1;
var v = 2 * Math.random() - 1;
var r = u * u + v * v;
if (r == 0 || r > 1) return gaussRandom();
var c = Math.sqrt(-2 * Math.log(r) / r);
v_val = v * c; // cache this
return_v = true;
return u * c;
}
var randf = function(a, b) {
return Math.random() * (b - a) + a;
}
var randi = function(a, b) {
return Math.floor(Math.random() * (b - a) + a);
}
var randn = function(mu, std) {
return mu + gaussRandom() * std;
}
// helper function returns array of zeros of length n
// and uses typed arrays if available
var zeros = function(n) {
if (typeof(n) === 'undefined' || isNaN(n)) {
return [];
}
if (typeof ArrayBuffer === 'undefined') {
// lacking browser support
var arr = new Array(n);
for (var i = 0; i < n; i++) {
arr[i] = 0;
}
return arr;
} else {
return new Float64Array(n);
}
}
// Mat holds a matrix
var Mat = function(n, d) {
// n is number of rows, d is number of columns
this.n = n;
this.d = d;
this.w = zeros(n * d);
this.dw = zeros(n * d);
}
Mat.prototype = {
get: function(row, col) {
// slow but careful accessor function
// we want row-major order
var ix = (this.d * row) + col;
assert(ix >= 0 && ix < this.w.length);
return this.w[ix];
},
set: function(row, col, v) {
// slow but careful accessor function
var ix = (this.d * row) + col;
assert(ix >= 0 && ix < this.w.length);
this.w[ix] = v;
},
setFrom: function(arr) {
for (var i = 0, n = arr.length; i < n; i++) {
this.w[i] = arr[i];
}
},
setColumn: function(m, i) {
for (var q = 0, n = m.w.length; q < n; q++) {
this.w[(this.d * q) + i] = m.w[q];
}
},
toJSON: function() {
var json = {};
json['n'] = this.n;
json['d'] = this.d;
json['w'] = this.w;
return json;
},
fromJSON: function(json) {
this.n = json.n;
this.d = json.d;
this.w = zeros(this.n * this.d);
this.dw = zeros(this.n * this.d);
for (var i = 0, n = this.n * this.d; i < n; i++) {
this.w[i] = json.w[i]; // copy over weights
}
}
}
var copyMat = function(b) {
var a = new Mat(b.n, b.d);
a.setFrom(b.w);
return a;
}
var copyNet = function(net) {
// nets are (k,v) pairs with k = string key, v = Mat()
var new_net = {};
for (var p in net) {
if (net.hasOwnProperty(p)) {
new_net[p] = copyMat(net[p]);
}
}
return new_net;
}
var updateMat = function(m, alpha) {
// updates in place
for (var i = 0, n = m.n * m.d; i < n; i++) {
if (m.dw[i] !== 0) {
m.w[i] += -alpha * m.dw[i];
m.dw[i] = 0;
}
}
}
var updateNet = function(net, alpha) {
for (var p in net) {
if (net.hasOwnProperty(p)) {
updateMat(net[p], alpha);
}
}
}
var netToJSON = function(net) {
var j = {};
for (var p in net) {
if (net.hasOwnProperty(p)) {
j[p] = net[p].toJSON();
}
}
return j;
}
var netFromJSON = function(j) {
var net = {};
for (var p in j) {
if (j.hasOwnProperty(p)) {
net[p] = new Mat(1, 1); // not proud of this
net[p].fromJSON(j[p]);
}
}
return net;
}
var netZeroGrads = function(net) {
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
gradFillConst(mat, 0);
}
}
}
var netFlattenGrads = function(net) {
var n = 0;
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
n += mat.dw.length;
}
}
var g = new Mat(n, 1);
var ix = 0;
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
for (var i = 0, m = mat.dw.length; i < m; i++) {
g.w[ix] = mat.dw[i];
ix++;
}
}
}
return g;
}
// return Mat but filled with random numbers from gaussian
var RandMat = function(n, d, mu, std) {
var m = new Mat(n, d);
fillRandn(m, mu, std);
//fillRand(m,-std,std); // kind of :P
return m;
}
// Mat utils
// fill matrix with random gaussian numbers
var fillRandn = function(m, mu, std) {
for (var i = 0, n = m.w.length; i < n; i++) {
m.w[i] = randn(mu, std);
}
}
var fillRand = function(m, lo, hi) {
for (var i = 0, n = m.w.length; i < n; i++) {
m.w[i] = randf(lo, hi);
}
}
var gradFillConst = function(m, c) {
for (var i = 0, n = m.dw.length; i < n; i++) {
m.dw[i] = c
}
}
// Transformer definitions
var Graph = function(needs_backprop) {
if (typeof needs_backprop === 'undefined') {
needs_backprop = true;
}
this.needs_backprop = needs_backprop;
// this will store a list of functions that perform backprop,
// in their forward pass order. So in backprop we will go
// backwards and evoke each one
this.backprop = [];
}
Graph.prototype = {
backward: function() {
for (var i = this.backprop.length - 1; i >= 0; i--) {
this.backprop[i](); // tick!
}
},
rowPluck: function(m, ix) {
// pluck a row of m with index ix and return it as col vector
assert(ix >= 0 && ix < m.n);
var d = m.d;
var out = new Mat(d, 1);
for (var i = 0, n = d; i < n; i++) {
out.w[i] = m.w[d * ix + i];
} // copy over the data
if (this.needs_backprop) {
var backward = function() {
for (var i = 0, n = d; i < n; i++) {
m.dw[d * ix + i] += out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
tanh: function(m) {
// tanh nonlinearity
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = Math.tanh(m.w[i]);
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0; i < n; i++) {
// grad for z = tanh(x) is (1 - z^2)
var mwi = out.w[i];
m.dw[i] += (1.0 - mwi * mwi) * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
sigmoid: function(m) {
// sigmoid nonlinearity
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = sig(m.w[i]);
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0; i < n; i++) {
// grad for z = sigmoid(x) is z * (1 - z)
var mwi = out.w[i];
m.dw[i] += mwi * (1.0 - mwi) * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
relu: function(m) {
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = Math.max(0, m.w[i]); // relu
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0; i < n; i++) {
m.dw[i] += m.w[i] > 0 ? out.dw[i] : 0.0;
}
}
this.backprop.push(backward);
}
return out;
},
mul: function(m1, m2) {
// multiply matrices m1 * m2
assert(m1.d === m2.n, 'matmul dimensions misaligned');
var n = m1.n;
var d = m2.d;
var out = new Mat(n, d);
for (var i = 0; i < m1.n; i++) { // loop over rows of m1
for (var j = 0; j < m2.d; j++) { // loop over cols of m2
var dot = 0.0;
for (var k = 0; k < m1.d; k++) { // dot product loop
dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j];
}
out.w[d * i + j] = dot;
}
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0; i < m1.n; i++) { // loop over rows of m1
for (var j = 0; j < m2.d; j++) { // loop over cols of m2
for (var k = 0; k < m1.d; k++) { // dot product loop
var b = out.dw[d * i + j];
m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b;
m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b;
}
}
}
}
this.backprop.push(backward);
}
return out;
},
add: function(m1, m2) {
assert(m1.w.length === m2.w.length);
var out = new Mat(m1.n, m1.d);
for (var i = 0, n = m1.w.length; i < n; i++) {
out.w[i] = m1.w[i] + m2.w[i];
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += out.dw[i];
m2.dw[i] += out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
dot: function(m1, m2) {
// m1 m2 are both column vectors
assert(m1.w.length === m2.w.length);
var out = new Mat(1, 1);
var dot = 0.0;
for (var i = 0, n = m1.w.length; i < n; i++) {
dot += m1.w[i] * m2.w[i];
}
out.w[0] = dot;
if (this.needs_backprop) {
var backward = function() {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += m2.w[i] * out.dw[0];
m2.dw[i] += m1.w[i] * out.dw[0];
}
}
this.backprop.push(backward);
}
return out;
},
eltmul: function(m1, m2) {
assert(m1.w.length === m2.w.length);
var out = new Mat(m1.n, m1.d);
for (var i = 0, n = m1.w.length; i < n; i++) {
out.w[i] = m1.w[i] * m2.w[i];
}
if (this.needs_backprop) {
var backward = function() {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += m2.w[i] * out.dw[i];
m2.dw[i] += m1.w[i] * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
}
var softmax = function(m) {
var out = new Mat(m.n, m.d); // probability volume
var maxval = -999999;
for (var i = 0, n = m.w.length; i < n; i++) {
if (m.w[i] > maxval) maxval = m.w[i];
}
var s = 0.0;
for (var i = 0, n = m.w.length; i < n; i++) {
out.w[i] = Math.exp(m.w[i] - maxval);
s += out.w[i];
}
for (var i = 0, n = m.w.length; i < n; i++) {
out.w[i] /= s;
}
// no backward pass here needed
// since we will use the computed probabilities outside
// to set gradients directly on m
return out;
}
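// Illustrative sketch (added for clarity, not part of the original library and
// never invoked): how softmax is typically paired with a cross-entropy loss.
// Because softmax() records no backward pass, the caller writes the gradient
// (probs - one_hot(target)) straight into the logits' dw and then calls
// graph.backward(). All parameter names here are hypothetical.
var exampleSoftmaxCrossEntropy = function(graph, logits, target_ix) {
var probs = softmax(logits); // probabilities over the entries of logits
for (var i = 0, n = logits.w.length; i < n; i++) {
logits.dw[i] = probs.w[i]; // dL/dlogit_i = p_i ...
}
logits.dw[target_ix] -= 1.0; // ... minus 1 at the target index
graph.backward(); // propagate into whatever produced the logits
return -Math.log(probs.w[target_ix]); // cross-entropy loss value
}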
var Solver = function() {
this.decay_rate = 0.999;
this.smooth_eps = 1e-8;
this.step_cache = {};
}
Solver.prototype = {
step: function(model, step_size, regc, clipval) {
// perform parameter update
var solver_stats = {};
var num_clipped = 0;
var num_tot = 0;
for (var k in model) {
if (model.hasOwnProperty(k)) {
var m = model[k]; // mat ref
if (!(k in this.step_cache)) {
this.step_cache[k] = new Mat(m.n, m.d);
}
var s = this.step_cache[k];
for (var i = 0, n = m.w.length; i < n; i++) {
// rmsprop adaptive learning rate
var mdwi = m.dw[i];
s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi;
// gradient clip
if (mdwi > clipval) {
mdwi = clipval;
num_clipped++;
}
if (mdwi < -clipval) {
mdwi = -clipval;
num_clipped++;
}
num_tot++;
// update (and regularize)
m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i];
m.dw[i] = 0; // reset gradients for next iteration
}
}
}
solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot;
return solver_stats;
}
}
var initLSTM = function(input_size, hidden_sizes, output_size) {
// hidden size should be a list
var model = {};
for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths
var prev_size = d === 0 ? input_size : hidden_sizes[d - 1];
var hidden_size = hidden_sizes[d];
// gates parameters
model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0,
0.08);
model['bi' + d] = new Mat(hidden_size, 1);
model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0,
0.08);
model['bf' + d] = new Mat(hidden_size, 1);
model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0,
0.08);
model['bo' + d] = new Mat(hidden_size, 1);
// cell write params
model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0,
0.08);
model['bc' + d] = new Mat(hidden_size, 1);
}
// decoder params
model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08);
model['bd'] = new Mat(output_size, 1);
return model;
}
var forwardLSTM = function(G, model, hidden_sizes, x, prev) {
// forward prop for a single tick of LSTM
// G is graph to append ops to
// model contains LSTM parameters
// x is 1D column vector with observation
// prev is a struct containing hidden and cell
// from previous iteration
if (prev == null || typeof prev.h === 'undefined') {
var hidden_prevs = [];
var cell_prevs = [];
for (var d = 0; d < hidden_sizes.length; d++) {
hidden_prevs.push(new R.Mat(hidden_sizes[d], 1));
cell_prevs.push(new R.Mat(hidden_sizes[d], 1));
}
} else {
var hidden_prevs = prev.h;
var cell_prevs = prev.c;
}
var hidden = [];
var cell = [];
for (var d = 0; d < hidden_sizes.length; d++) {
var input_vector = d === 0 ? x : hidden[d - 1];
var hidden_prev = hidden_prevs[d];
var cell_prev = cell_prevs[d];
// input gate
var h0 = G.mul(model['Wix' + d], input_vector);
var h1 = G.mul(model['Wih' + d], hidden_prev);
var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' +
d]));
// forget gate
var h2 = G.mul(model['Wfx' + d], input_vector);
var h3 = G.mul(model['Wfh' + d], hidden_prev);
var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' +
d]));
// output gate
var h4 = G.mul(model['Wox' + d], input_vector);
var h5 = G.mul(model['Woh' + d], hidden_prev);
var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' +
d]));
// write operation on cells
var h6 = G.mul(model['Wcx' + d], input_vector);
var h7 = G.mul(model['Wch' + d], hidden_prev);
var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d]));
// compute new cell activation
var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell
var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell
var cell_d = G.add(retain_cell, write_cell); // new cell contents
// compute hidden state as gated, saturated cell activations
var hidden_d = G.eltmul(output_gate, G.tanh(cell_d));
hidden.push(hidden_d);
cell.push(cell_d);
}
// one decoder to outputs at end
var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]),
model['bd']);
// return cell memory, hidden representation and output
return {
'h': hidden,
'c': cell,
'o': output
};
}
var sig = function(x) {
// helper function for computing sigmoid
return 1.0 / (1 + Math.exp(-x));
}
var maxi = function(w) {
// argmax of array w
var maxv = w[0];
var maxix = 0;
for (var i = 1, n = w.length; i < n; i++) {
var v = w[i];
if (v > maxv) {
maxix = i;
maxv = v;
}
}
return maxix;
}
var samplei = function(w) {
// sample an index from w, assuming w are
// probabilities that sum to one
var r = randf(0, 1);
var x = 0.0;
var i = 0;
while (true) {
x += w[i];
if (x > r) {
return i;
}
i++;
}
return w.length - 1; // pretty sure we should never get here?
}
// various utils
global.assert = assert;
global.zeros = zeros;
global.maxi = maxi;
global.samplei = samplei;
global.randi = randi;
global.randn = randn;
global.softmax = softmax;
// classes
global.Mat = Mat;
global.RandMat = RandMat;
global.forwardLSTM = forwardLSTM;
global.initLSTM = initLSTM;
// more utils
global.updateMat = updateMat;
global.updateNet = updateNet;
global.copyMat = copyMat;
global.copyNet = copyNet;
global.netToJSON = netToJSON;
global.netFromJSON = netFromJSON;
global.netZeroGrads = netZeroGrads;
global.netFlattenGrads = netFlattenGrads;
// optimization
global.Solver = Solver;
global.Graph = Graph;
})(R);
// END OF RECURRENTJS
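// Illustrative usage sketch (added for clarity, never invoked): unroll the LSTM
// over a short sequence of one-hot symbols, set a cross-entropy gradient on each
// output, backprop through the whole graph and take one rmsprop step. The sizes
// and the example sequence below are made up.
var exampleLSTMTrainingStep = function() {
var input_size = 10, hidden_sizes = [20], output_size = 10;
var model = R.initLSTM(input_size, hidden_sizes, output_size);
var solver = new R.Solver();
var G = new R.Graph(true); // needs_backprop = true
var prev = {};
var sequence = [3, 1, 4, 1, 5]; // hypothetical symbol indices
for (var t = 0; t < sequence.length - 1; t++) {
var x = new R.Mat(input_size, 1);
x.w[sequence[t]] = 1.0; // one-hot encode the current symbol
prev = R.forwardLSTM(G, model, hidden_sizes, x, prev);
var probs = R.softmax(prev.o);
for (var i = 0; i < output_size; i++) prev.o.dw[i] = probs.w[i];
prev.o.dw[sequence[t + 1]] -= 1.0; // cross-entropy gradient w.r.t. the logits
}
G.backward(); // accumulate gradients through the unrolled graph
solver.step(model, 0.01, 1e-6, 5.0); // step_size, L2 regularization, gradient clip
return model;
}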
var RL = {};
(function(global) {
"use strict";
// syntactic sugar function for getting default parameter values
var getopt = function(opt, field_name, default_value) {
if (typeof opt === 'undefined') {
return default_value;
}
return (typeof opt[field_name] !== 'undefined') ? opt[field_name] :
default_value;
}
var zeros = R.zeros; // inherit these
var assert = R.assert;
var randi = R.randi;
var randf = R.randf;
var setConst = function(arr, c) {
for (var i = 0, n = arr.length; i < n; i++) {
arr[i] = c;
}
}
var sampleWeighted = function(p) {
var r = Math.random();
var c = 0.0;
for (var i = 0, n = p.length; i < n; i++) {
c += p[i];
if (c >= r) {
return i;
}
}
assert(false, 'wtf');
}
// ------
// AGENTS
// ------
// DPAgent performs Value Iteration
// - can also be used for Policy Iteration if you really wanted to
// - requires model of the environment :(
// - does not learn from experience :(
// - assumes finite MDP :(
var DPAgent = function(env, opt) {
this.V = null; // state value function
this.P = null; // policy distribution \pi(s,a)
this.env = env; // store pointer to environment
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.reset();
}
DPAgent.prototype = {
reset: function() {
// reset the agent's policy and value function
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.V = zeros(this.ns);
this.P = zeros(this.ns * this.na);
// initialize uniform random policy
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
}
}
},
act: function(s) {
// behave according to the learned policy
var poss = this.env.allowedActions(s);
var ps = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var prob = this.P[a * this.ns + s];
ps.push(prob);
}
var maxi = sampleWeighted(ps);
return poss[maxi];
},
learn: function() {
// perform a single round of value iteration
this.evaluatePolicy(); // writes this.V
this.updatePolicy(); // writes this.P
},
evaluatePolicy: function() {
// perform a synchronous update of the value function
var Vnew = zeros(this.ns);
for (var s = 0; s < this.ns; s++) {
// integrate over actions in a stochastic policy
// note that we assume that policy probability mass over allowed actions sums to one
var v = 0.0;
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var prob = this.P[a * this.ns + s]; // probability of
taking action under policy
if (prob === 0) {
continue;
} // no contribution, skip for speed
var ns = this.env.nextStateDistribution(s, a);
var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition
v += prob * (rs + this.gamma * this.V[ns]);
}
Vnew[s] = v;
}
this.V = Vnew; // swap
},
updatePolicy: function() {
// update policy to be greedy w.r.t. learned Value function
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
// compute value of taking each allowed action
var vmax, nmax;
var vs = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var ns = this.env.nextStateDistribution(s, a);
var rs = this.env.reward(s, a, ns);
var v = rs + this.gamma * this.V[ns];
vs.push(v);
if (i === 0 || v > vmax) {
vmax = v;
nmax = 1;
} else if (v === vmax) {
nmax += 1;
}
}
// update policy smoothly across all argmaxy actions
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
this.P[a * this.ns + s] = (vs[i] === vmax) ? 1.0 /
nmax : 0.0;
}
}
},
}
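// Illustrative sketch (added for clarity, never invoked): the environment
// interface DPAgent relies on. This toy two-state chain is made up; action 0
// stays in place, action 1 jumps to the other state, and landing in state 1
// pays a reward of 1. Note that despite its name, nextStateDistribution is
// expected to return a single (deterministic) next state index.
var exampleChainEnv = {
getNumStates: function() { return 2; },
getMaxNumActions: function() { return 2; },
allowedActions: function(s) { return [0, 1]; }, // both actions allowed everywhere
nextStateDistribution: function(s, a) { return a === 0 ? s : 1 - s; },
reward: function(s, a, ns) { return ns === 1 ? 1.0 : 0.0; }
};
// usage would look roughly like:
// var agent = new DPAgent(exampleChainEnv, { gamma: 0.9 });
// for (var k = 0; k < 20; k++) { agent.learn(); } // value iteration sweeps
// var a = agent.act(0); // act greedily w.r.t. the learned values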
// TDAgent uses TD learning (Q-Learning, SARSA)
// - does not require environment model :)
// - learns from experience :)
var TDAgent = function(env, opt) {
this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
// class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q
this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false);
this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on
// eligibility traces
this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used
this.replacing_traces = getopt(opt, 'replacing_traces', true);
// optional optimistic initial values
this.q_init_val = getopt(opt, 'q_init_val', 0);
this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning)
this.Q = null; // state action value function
this.P = null; // policy distribution \pi(s,a)
this.e = null; // eligibility trace
this.env_model_s = null; // environment model (s,a) -> s'
this.env_model_r = null; // environment model (s,a) -> r
this.env = env; // store pointer to environment
this.reset();
}
TDAgent.prototype = {
reset: function() {
// reset the agent's policy and value function
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.Q = zeros(this.ns * this.na);
if (this.q_init_val !== 0) {
setConst(this.Q, this.q_init_val);
}
this.P = zeros(this.ns * this.na);
this.e = zeros(this.ns * this.na);
// model/planning vars
this.env_model_s = zeros(this.ns * this.na);
setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state before
this.env_model_r = zeros(this.ns * this.na);
this.sa_seen = [];
this.pq = zeros(this.ns * this.na);
// initialize uniform random policy
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
}
}
// agent memory, needed for streaming updates
// (s0,a0,r0,s1,a1,r1,...)
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
},
resetEpisode: function() {
// an episode finished
},
act: function(s) {
// act according to epsilon greedy policy
var poss = this.env.allowedActions(s);
var probs = [];
for (var i = 0, n = poss.length; i < n; i++) {
probs.push(this.P[poss[i] * this.ns + s]);
}
// epsilon greedy policy
if (Math.random() < this.epsilon) {
var a = poss[randi(0, poss.length)]; // random available action
this.explored = true;
} else {
var a = poss[sampleWeighted(probs)];
this.explored = false;
}
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function(r1) {
// takes reward for previous action, which came from a call to act()
if (!(this.r0 == null)) {
this.learnFromTuple(this.s0, this.a0, this.r0, this.s1,
this.a1, this.lambda);
if (this.planN > 0) {
this.updateModel(this.s0, this.a0, this.r0, this.s1);
this.plan();
}
}
this.r0 = r1; // store this for next update
},
updateModel: function(s0, a0, r0, s1) {
// transition (s0,a0) -> (r0,s1) was observed. Update environment model
var sa = a0 * this.ns + s0;
if (this.env_model_s[sa] === -1) {
// first time we see this state action
this.sa_seen.push(a0 * this.ns + s0); // add as seen state
}
this.env_model_s[sa] = s1;
this.env_model_r[sa] = r0;
},
plan: function() {
// order the states based on current priority queue information
var spq = [];
for (var i = 0, n = this.sa_seen.length; i < n; i++) {
var sa = this.sa_seen[i];
var sap = this.pq[sa];
if (sap > 1e-5) { // gain a bit of efficiency
spq.push({
sa: sa,
p: sap
});
}
}
spq.sort(function(a, b) {
return a.p < b.p ? 1 : -1
});
// perform the updates
var nsteps = Math.min(this.planN, spq.length);
for (var k = 0; k < nsteps; k++) {
// random exploration
//var i = randi(0, this.sa_seen.length); // pick random prev seen state action
//var s0a0 = this.sa_seen[i];
var s0a0 = spq[k].sa;
this.pq[s0a0] = 0; // erase priority, since we're backing up this state
var s0 = s0a0 % this.ns;
var a0 = Math.floor(s0a0 / this.ns);
var r0 = this.env_model_r[s0a0];
var s1 = this.env_model_s[s0a0];
var a1 = -1; // not used for Q learning
if (this.update === 'sarsa') {
// generate random action?...
var poss = this.env.allowedActions(s1);
var a1 = poss[randi(0, poss.length)];
}
this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldn't use eligibility trace here
}
},
learnFromTuple: function(s0, a0, r0, s1, a1, lambda) {
var sa = a0 * this.ns + s0;
// calculate the target for Q(s,a)
if (this.update === 'qlearn') {
// Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
var poss = this.env.allowedActions(s1);
var qmax = 0;
for (var i = 0, n = poss.length; i < n; i++) {
var s1a = poss[i] * this.ns + s1;
var qval = this.Q[s1a];
if (i === 0 || qval > qmax) {
qmax = qval;
}
}
var target = r0 + this.gamma * qmax;
} else if (this.update === 'sarsa') {
// SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
var s1a1 = a1 * this.ns + s1;
var target = r0 + this.gamma * this.Q[s1a1];
}
if (lambda > 0) {
// perform an eligibility trace update
if (this.replacing_traces) {
this.e[sa] = 1;
} else {
this.e[sa] += 1;
}
var edecay = lambda * this.gamma;
var state_update = zeros(this.ns);
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0; i < poss.length; i++) {
var a = poss[i];
var saloop = a * this.ns + s;
var esa = this.e[saloop];
var update = this.alpha * esa * (target -
this.Q[saloop]);
this.Q[saloop] += update;
this.updatePriority(s, a, update);
this.e[saloop] *= edecay;
var u = Math.abs(update);
if (u > state_update[s]) {
state_update[s] = u;
}
}
}
for (var s = 0; s < this.ns; s++) {
if (state_update[s] > 1e-5) { // save efficiency here
this.updatePolicy(s);
}
}
if (this.explored && this.update === 'qlearn') {
// have to wipe the trace since q learning is off-policy :(
this.e = zeros(this.ns * this.na);
}
} else {
// simpler and faster update without eligibility trace
// update Q[sa] towards it with some step size
var update = this.alpha * (target - this.Q[sa]);
this.Q[sa] += update;
this.updatePriority(s0, a0, update);
// update the policy to reflect the change (if appropriate)
this.updatePolicy(s0);
}
},
updatePriority: function(s, a, u) {
// used in planning. Invoked when Q[sa] += update
// we should find all states that lead to (s,a) and upgrade their priority
// of being updated in the next planning step
u = Math.abs(u);
if (u < 1e-5) {
return;
} // for efficiency skip small updates
if (this.planN === 0) {
return;
} // there is no planning to be done, skip.
for (var si = 0; si < this.ns; si++) {
// note we are also iterating over impossible actions at all states,
// but this should be okay because their env_model_s should simply be -1
// as initialized, so they will never be predicted to point to any state
// because they will never be observed, and hence never be added to the model
for (var ai = 0; ai < this.na; ai++) {
var siai = ai * this.ns + si;
if (this.env_model_s[siai] === s) {
// this state leads to s, add it to priority
queue
this.pq[siai] += u;
}
}
}
},
updatePolicy: function(s) {
var poss = this.env.allowedActions(s);
// set policy at s to be the action that achieves max_a Q(s,a)
// first find the maxy Q values
var qmax, nmax;
var qs = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var qval = this.Q[a * this.ns + s];
qs.push(qval);
if (i === 0 || qval > qmax) {
qmax = qval;
nmax = 1;
} else if (qval === qmax) {
nmax += 1;
}
}
// now update the policy smoothly towards the argmaxy actions
var psum = 0.0;
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var target = (qs[i] === qmax) ? 1.0 / nmax : 0.0;
var ix = a * this.ns + s;
if (this.smooth_policy_update) {
// slightly hacky :p
this.P[ix] += this.beta * (target - this.P[ix]);
psum += this.P[ix];
} else {
// set hard target
this.P[ix] = target;
}
}
if (this.smooth_policy_update) {
// renomalize P if we're using smooth policy updates
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
this.P[a * this.ns + s] /= psum;
}
}
}
}
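// Illustrative sketch (added for clarity, never invoked): driving a TDAgent.
// The environment must expose getNumStates, getMaxNumActions and
// allowedActions(s); the caller owns the transition dynamics and feeds rewards
// back after each act(). The chain dynamics and rewards below are made up.
var exampleTDLoop = function(steps) {
var env = {
getNumStates: function() { return 2; },
getMaxNumActions: function() { return 2; },
allowedActions: function(s) { return [0, 1]; }
};
var agent = new TDAgent(env, { update: 'qlearn', gamma: 0.9, epsilon: 0.1, alpha: 0.05 });
var s = 0;
for (var t = 0; t < steps; t++) {
var a = agent.act(s); // epsilon-greedy action for current state
var ns = a === 0 ? s : 1 - s; // made-up deterministic transition
var r = ns === 1 ? 1.0 : 0.0; // made-up reward
agent.learn(r); // reward credited to the previous action
s = ns;
}
return agent;
}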
var DQNAgent = function(env, opt) {
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory
this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay
this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10);
this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0);
this.num_hidden_units = getopt(opt, 'num_hidden_units', 100);
this.env = env;
this.reset();
}
DQNAgent.prototype = {
reset: function() {
this.nh = this.num_hidden_units; // number of hidden units
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
// nets are hardcoded for now as key (str) -> Mat
// not proud of this. better solution is to have a whole Net object
// on top of Mats, but for now sticking with this
this.net = {};
this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01);
this.net.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.exp = []; // experience
this.expi = 0; // where to insert
this.t = 0;
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
this.tderror = 0; // for visualization only...
},
toJSON: function() {
// save function
var j = {};
j.nh = this.nh;
j.ns = this.ns;
j.na = this.na;
j.net = R.netToJSON(this.net);
return j;
},
fromJSON: function(j) {
// load function
this.nh = j.nh;
this.ns = j.ns;
this.na = j.na;
this.net = R.netFromJSON(j.net);
},
forwardQ: function(net, s, needs_backprop) {
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
this.lastG = G; // back this up. Kind of hacky isn't it
return a2mat;
},
act: function(slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// epsilon greedy policy
if (Math.random() < this.epsilon) {
var a = randi(0, this.na);
} else {
// greedy wrt Q function
var amat = this.forwardQ(this.net, s, false);
var a = R.maxi(amat.w); // returns index of argmax action
}
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function(r1) {
// perform an update on Q function
if (!(this.r0 == null) && this.alpha > 0) {
// learn from this tuple to get a sense of how "surprising" it is to the agent
var tderror = this.learnFromTuple(this.s0, this.a0,
this.r0, this.s1, this.a1);
this.tderror = tderror; // a measure of surprise
// decide if we should keep this experience in the replay
if (this.t % this.experience_add_every === 0) {
this.exp[this.expi] = [this.s0, this.a0, this.r0,
this.s1, this.a1];
this.expi += 1;
if (this.expi > this.experience_size) {
this.expi = 0;
} // roll over when we run out
}
this.t += 1;
// sample some additional experience from replay memory and learn from it
for (var k = 0; k < this.learning_steps_per_iteration; k++) {
var ri = randi(0, this.exp.length); // todo: priority sweeps?
var e = this.exp[ri];
this.learnFromTuple(e[0], e[1], e[2], e[3], e[4])
}
}
this.r0 = r1; // store for next update
},
learnFromTuple: function(s0, a0, r0, s1, a1) {
// want: Q(s,a) = r + gamma * max_a' Q(s',a')
// compute the target Q value
var tmat = this.forwardQ(this.net, s1, false);
var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)];
// now predict
var pred = this.forwardQ(this.net, s0, true);
var tderror = pred.w[a0] - qmax;
var clamp = this.tderror_clamp;
if (Math.abs(tderror) > clamp) { // huber loss to robustify
if (tderror > clamp) tderror = clamp;
if (tderror < -clamp) tderror = -clamp;
}
pred.dw[a0] = tderror;
this.lastG.backward(); // compute gradients on net params
// update net
R.updateNet(this.net, this.alpha);
return tderror;
}
}
// buggy implementation, doesn't work...
var SimpleReinforceAgent = function(env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
SimpleReinforceAgent.prototype = {
reset: function() {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 100; // number of hidden units
this.nhb = 100; // and also in the baseline lstm
this.actorNet = {};
this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1);
this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.actorOutputs = [];
this.actorGraphs = [];
this.actorActions = []; // sampled ones
this.rewardHistory = [];
this.baselineNet = {};
this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01);
this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01);
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01);
this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.baselineOutputs = [];
this.baselineGraphs = [];
this.t = 0;
},
forwardActor: function(s, needs_backprop) {
var net = this.actorNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
forwardValue: function(s, needs_backprop) {
var net = this.baselineNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
act: function(slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the actor to get action output
var ans = this.forwardActor(s, true);
var amat = ans.a;
var ag = ans.G;
this.actorOutputs.push(amat);
this.actorGraphs.push(ag);
// forward the baseline estimator
var ans = this.forwardValue(s, true);
var vmat = ans.a;
var vg = ans.G;
this.baselineOutputs.push(vmat);
this.baselineGraphs.push(vg);
// sample action from the stochastic gaussian policy
var a = R.copyMat(amat);
var gaussVar = 0.02;
a.w[0] = R.randn(0, gaussVar);
a.w[1] = R.randn(0, gaussVar);
this.actorActions.push(a);
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function(r1) {
// perform an update on Q function
this.rewardHistory.push(r1);
var n = this.rewardHistory.length;
var baselineMSE = 0.0;
var nup = 100; // what chunk of experience to take
var nuse = 80; // what chunk to update from
if (n >= nup) {
// lets learn and flush
// first: compute the sample values at all points
var vs = [];
for (var t = 0; t < nuse; t++) {
var mul = 1;
// compute the actual discounted reward for this time step
var V = 0;
for (var t2 = t; t2 < n; t2++) {
V += mul * this.rewardHistory[t2];
mul *= this.gamma;
if (mul < 1e-5) {
break;
} // efficiency savings
}
// get the predicted baseline at this time step
var b = this.baselineOutputs[t].w[0];
for (var i = 0; i < this.na; i++) {
// [the action delta] * [the desirability]
var update = -(V - b) *
(this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.actorOutputs[t].dw[i] += update;
}
var update = -(V - b);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.baselineOutputs[t].dw[0] += update;
baselineMSE += (V - b) * (V - b);
vs.push(V);
}
baselineMSE /= nuse;
// backprop all the things
for (var t = 0; t < nuse; t++) {
this.actorGraphs[t].backward();
this.baselineGraphs[t].backward();
}
R.updateNet(this.actorNet, this.alpha); // update actor network
R.updateNet(this.baselineNet, this.beta); // update baseline network
// flush
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineOutputs = [];
this.actorGraphs = [];
this.baselineGraphs = [];
this.tderror = baselineMSE;
}
this.t += 1;
this.r0 = r1; // store for next update
},
}
// buggy implementation as well, doesn't work
var RecurrentReinforceAgent = function(env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
RecurrentReinforceAgent.prototype = {
reset: function() {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 40; // number of hidden units
this.nhb = 40; // and also in the baseline lstm
this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na);
this.actorG = new R.Graph();
this.actorPrev = null;
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1);
this.baselineG = new R.Graph();
this.baselinePrev = null;
this.baselineOutputs = [];
this.t = 0;
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
},
act: function(slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the LSTM to get action distribution
var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM,
[this.nh], s, this.actorPrev);
this.actorPrev = actorNext;
var amat = actorNext.o;
this.actorOutputs.push(amat);
// forward the baseline LSTM
var baselineNext = R.forwardLSTM(this.baselineG,
this.baselineLSTM, [this.nhb], s, this.baselinePrev);
this.baselinePrev = baselineNext;
this.baselineOutputs.push(baselineNext.o);
// sample action from actor policy
var gaussVar = 0.05;
var a = R.copyMat(amat);
for (var i = 0, n = a.w.length; i < n; i++) {
a.w[i] += R.randn(0, gaussVar); // gaussian exploration noise on each action dimension
}
this.actorActions.push(a);
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function(r1) {
// perform an update on Q function
this.rewardHistory.push(r1);
var n = this.rewardHistory.length;
var baselineMSE = 0.0;
var nup = 100; // what chunk of experience to take
var nuse = 80; // what chunk to also update
if (n >= nup) {
// lets learn and flush
// first: compute the sample values at all points
var vs = [];
for (var t = 0; t < nuse; t++) {
var mul = 1;
var V = 0;
for (var t2 = t; t2 < n; t2++) {
V += mul * this.rewardHistory[t2];
mul *= this.gamma;
if (mul < 1e-5) {
break;
} // efficiency savings
}
var b = this.baselineOutputs[t].w[0];
// todo: take out the constants etc.
for (var i = 0; i < this.na; i++) {
// [the action delta] * [the desirability]
var update = -(V - b) *
(this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.actorOutputs[t].dw[i] += update;
}
var update = -(V - b);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.baselineOutputs[t].dw[0] += update;
baselineMSE += (V - b) * (V - b);
vs.push(V);
}
baselineMSE /= nuse;
this.actorG.backward(); // update params! woohoo!
this.baselineG.backward();
R.updateNet(this.actorLSTM, this.alpha); // update actor network
R.updateNet(this.baselineLSTM, this.beta); // update baseline network
// flush
this.actorG = new R.Graph();
this.actorPrev = null;
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineG = new R.Graph();
this.baselinePrev = null;
this.baselineOutputs = [];
this.tderror = baselineMSE;
}
this.t += 1;
this.r0 = r1; // store for next update
},
}
// Currently buggy implementation, doesn't work
var DeterministPG = function(env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
DeterministPG.prototype = {
reset: function() {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 100; // number of hidden units
// actor
this.actorNet = {};
this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1);
this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.ntheta = this.na * this.ns + this.na; // number of params in actor
// critic
this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
this.t = 0;
},
forwardActor: function(s, needs_backprop) {
var net = this.actorNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
act: function(slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the actor to get action output
var ans = this.forwardActor(s, false);
var amat = ans.a;
var ag = ans.G;
// sample action from the stochastic gaussian policy
var a = R.copyMat(amat);
if (Math.random() < this.epsilon) {
var gaussVar = 0.02;
a.w[0] = R.randn(0, gaussVar);
a.w[1] = R.randn(0, gaussVar);
}
var clamp = 0.25;
if (a.w[0] > clamp) a.w[0] = clamp;
if (a.w[0] < -clamp) a.w[0] = -clamp;
if (a.w[1] > clamp) a.w[1] = clamp;
if (a.w[1] < -clamp) a.w[1] = -clamp;
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
utilJacobianAt: function(s) {
var ujacobian = new R.Mat(this.ntheta, this.na);
for (var a = 0; a < this.na; a++) {
R.netZeroGrads(this.actorNet);
var ag = this.forwardActor(this.s0, true);
ag.a.dw[a] = 1.0;
ag.G.backward();
var gflat = R.netFlattenGrads(this.actorNet);
ujacobian.setColumn(gflat, a);
}
return ujacobian;
},
learn: function(r1) {
// perform an update on Q function
//this.rewardHistory.push(r1);
if (!(this.r0 == null)) {
var Gtmp = new R.Graph(false);
// dpg update:
// first compute the features psi:
// the jacobian matrix of the actor for s
var ujacobian0 = this.utilJacobianAt(this.s0);
// now form the features \psi(s,a)
var psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector
var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1
// now do the same thing because we need \psi(s_{t+1}, \mu_\theta(s_{t+1}))
var ujacobian1 = this.utilJacobianAt(this.s1);
var ag = this.forwardActor(this.s1, false);
var psi_sa1 = Gtmp.mul(ujacobian1, ag.a);
var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1
// get the td error finally
var tderror = this.r0 + this.gamma * qw1.w[0] -
qw0.w[0]; // lol
if (tderror > 0.5) tderror = 0.5; // clamp
if (tderror < -0.5) tderror = -0.5;
this.tderror = tderror;
// update actor policy with natural gradient
var net = this.actorNet;
var ix = 0;
for (var p in net) {
var mat = net[p];
if (net.hasOwnProperty(p)) {
for (var i = 0, n = mat.w.length; i < n; i++) {
mat.w[i] += this.alpha *
this.criticw.w[ix]; // natural gradient update
ix += 1;
}
}
}
// update the critic parameters too
for (var i = 0; i < this.ntheta; i++) {
var update = this.beta * tderror * psi_sa0.w[i];
this.criticw.w[i] += update;
}
}
this.r0 = r1; // store for next update
},
}
// exports
global.DPAgent = DPAgent;
global.TDAgent = TDAgent;
global.DQNAgent = DQNAgent;
//global.SimpleReinforceAgent = SimpleReinforceAgent;
//global.RecurrentReinforceAgent = RecurrentReinforceAgent;
//global.DeterministPG = DeterministPG;
})(RL);
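// Illustrative sketch (added for clarity, never invoked): how a DQNAgent is
// driven. Only getNumStates() and getMaxNumActions() are required of the
// environment; the caller supplies state arrays and rewards. The two-armed
// bandit below and all names in it are made up.
var exampleDQNLoop = function(steps) {
var banditEnv = {
getNumStates: function() { return 1; }, // one dummy state feature
getMaxNumActions: function() { return 2; } // two possible actions
};
var agent = new RL.DQNAgent(banditEnv, { gamma: 0.9, epsilon: 0.1, alpha: 0.01 });
for (var t = 0; t < steps; t++) {
var action = agent.act([0]); // state: plain array of length getNumStates()
var reward = action === 1 ? 1.0 : 0.0; // hypothetical payoff: action 1 is better
agent.learn(reward); // reward credited to the previous action
}
return agent;
}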
var Trevel = {
//settings you can change
stop: true,
maxBet: 0.00001,
minBet: 0.00000002,
swap: true,
betSpeed: 2,//change this on init
verbose: true,
isTesting: false,
showEvery:100,//log details to console after bets if verbose is false
seedEvery:100, // change client seed after?
//money management
useKelly: false,//martingale performs better on live account!
kellyPercent: 5, //can't be more than 100 or less than 1
useMartingale: true, //if kelly is true this won't work
martingaleMultiplier: 2,
//bot settings, these are set automatically don't bother
currentBalance: 0,
startingBalance: 0,
betAmount: 0,
profit: 0,
totalBets: 0,
totalWins: 0,
winRate: 0,
betHistory: [], //this is a sequence of all winning bets, not the sequence of bets we placed
betOutcomes: [],
hbProbability: 0,
lbProbability: 0,
hbCount: 0,
lbcount: 0,
nextBet: "",
previousReward:0,
nextLog:0,
addBet: function(bet, outcome) {
if (bet === "LB" && outcome === "Win") {
Trevel.betHistory.push("LO");
Trevel.betOutcomes.push("W");
Trevel.totalWins++;
Trevel.lbcount++;
}
if (bet === "LB" && outcome === "Loose") {
Trevel.betHistory.push("HI");
Trevel.hbCount++;
Trevel.betOutcomes.push("L");
}
if (bet === "HB" && outcome === "Win") {
Trevel.betHistory.push("HI");
Trevel.totalWins++;
Trevel.hbCount++;
Trevel.betOutcomes.push("W");
}
if (bet === "HB" && outcome === "Loose") {
Trevel.betHistory.push("LO");
Trevel.lbcount++;
Trevel.betOutcomes.push("L");
}
Trevel.totalBets++;
},
calculateProbabilities: function() {
Trevel.hbProbability = Trevel.hbCount / Trevel.betHistory.length;
Trevel.lbProbability = Trevel.lbcount / Trevel.betHistory.length;
Trevel.winRate = Trevel.totalWins / Trevel.totalBets;
if(Trevel.isTesting===false){
Trevel.profit = Trevel.getProfit();
}
},
getCurrentBalance: function() {
return parseFloat($('#balance').html());
},
placeHighBet: function() {
$('#double_your_btc_bet_hi_button').click();
},
placeLowBet: function() {
$('#double_your_btc_bet_lo_button').click();
},
setBetAmount: function(amount) {
var elem = document.getElementById("double_your_btc_stake");
elem.value = amount;
},
changeSeed:function(){
$('#next_client_seed').val(Trevel.getNewSeed());
},
getNewSeed:function(){
var result = '';
var length = 16;
var chars =
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz';
for (var i = length; i > 0; --i) {
result += chars[Math.floor(Math.random() * chars.length)];
}
return result;
},
setOutcome: function(bet) {
if ($('#double_your_btc_bet_lose').html() !== '') {
Trevel.addBet(bet, "Loose");
} else {
Trevel.addBet(bet, "Win");
}
},
prepareBet: function() {
Trevel.calculateProbabilities();
if (Trevel.betHistory.length < 10) {
if (Trevel.useMartingale === true && Trevel.betHistory.length>12)
{
if ($('#double_your_btc_bet_lose').html() !== '' &&
parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier <
Trevel.maxBet) {
Trevel.setBetAmount((parseFloat($
('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier).toFixed(8));
} else {
Trevel.setBetAmount(Trevel.minBet);
}
}
} else {
if (Trevel.useKelly === true && Trevel.betHistory.length>12) {
Trevel.currentBalance = Trevel.getCurrentBalance();
var currMulty =
document.getElementById("double_your_btc_payout_multiplier").value;
var kellyAmount = (((Trevel.currentBalance *
Trevel.kellyPercent) / 100) * ((Trevel.winRate * currMulty - 1)) / (currMulty -
1)).toFixed(8);
if (kellyAmount > 0 && kellyAmount < Trevel.maxBet) {
Trevel.setBetAmount(kellyAmount);
} else {
Trevel.setBetAmount(Trevel.minBet);
}
} else if (Trevel.useMartingale === true &&
Trevel.betHistory.length>12) {
if ($('#double_your_btc_bet_lose').html() !== '' &&
parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier <
Trevel.maxBet) {
Trevel.setBetAmount((parseFloat($
('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier).toFixed(8));
} else {
Trevel.setBetAmount(Trevel.minBet);
}
}
}
},
placeBet: function() {
if (Trevel.nextBet === "HB") {
Trevel.placeHighBet();
} else if (Trevel.nextBet === "LB") {
Trevel.placeLowBet();
} else if (Trevel.betHistory.length > 0 && Trevel.swap === true) {
var prev = Trevel.betHistory[Trevel.betHistory.length - 1];
if (prev === "LO") {
Trevel.placeHighBet();
} else {
Trevel.placeLowBet();
}
} else {
Trevel.placeLowBet();
}
},
getProfit: function() {
return (Trevel.getCurrentBalance() -
Trevel.startingBalance).toFixed(8);
},
getNumStates: function() {
return 8;
},
getMaxNumActions: function() {
return 2;
},
getSentiment: function(bet) {
if (bet === "HI") {
return 1;
} else {
return 0;
}
},
getPreviousBets: function() {
var hist = [];
if (Trevel.betHistory.length > 12) {
// push the sentiment of the 8 most recent outcomes, newest first
for (var i = 1; i <= 8; i++) {
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - i]));
}
} else {
hist = [0, 1, 0, 1, 0, 1, 0, 1]; //in case we just started...
}
return hist;
},
getAgentState: function() { //we'll observe the last 8 bets
var s = Trevel.getPreviousBets();
return s;
},
getReward: function() {
var reward = 0;
var out1=Trevel.betOutcomes[Trevel.betOutcomes.length - 1];
var out2=Trevel.betOutcomes[Trevel.betOutcomes.length - 2];
if(out1==="L"){
if(Trevel.previousReward<0){
reward=Trevel.previousReward;
reward+=-0.03;
if(out2==="L"){
reward+=-0.03;
}
}
else{
reward=-0.03;
if(out2==="L"){
reward+=-0.03;
}
}
}
else{
if(Trevel.previousReward>0){
reward=Trevel.previousReward;
reward+=0.01;
if(out2==="W"){
reward+=0.01;
}
}
else{
reward=0.01;
if(out2==="W"){
reward+=0.01;
}
}
}
return reward;
},
//initialize Trevel
init: function() {
Trevel.startingBalance = Trevel.currentBalance = parseFloat($
('#balance').html());
Trevel.setBetAmount(Trevel.minBet);
Trevel.stop = true;
Trevel.swap = true;
Trevel.betSpeed=3000;
Trevel.nextLog=Trevel.showEvery;
}
};
RefreshPageAfterFreePlayTimerEnds = function() {};
//Deep Q learning with reinforceJS
var spec = {}
spec.update = 'qlearn';
spec.gamma = 0.9;
//if you are loading a trained network the values here should match your training values
spec.epsilon = 0.20;
spec.alpha = 0.1;
spec.experience_add_every = 5;
spec.experience_size = 999999;
spec.learning_steps_per_iteration = 5;
spec.tderror_clamp = 1.0;
spec.num_hidden_units = 100;
// create an environment object
var env = Trevel;
var rolls=0;
if (env.isTesting === false) {
env.init();
}
// create the DQN agent
agent = new RL.DQNAgent(env, spec);
setInterval(function() {
if (env.stop === false) {
var state = env.getAgentState();
var action = agent.act(state);
var outcome = "";
if (env.isTesting === false) {
if (action === 0) {
env.nextBet = "LB";
env.prepareBet();
env.placeBet();
env.setOutcome("LB");
outcome = env.betOutcomes[env.betOutcomes.length - 1];
} else if (action === 1) {
env.nextBet = "HB";
env.prepareBet();
env.placeBet();
env.setOutcome("HB");
outcome = env.betOutcomes[env.betOutcomes.length - 1];
}
rolls++;
if(rolls>=env.seedEvery){
env.changeSeed();
}
env.calculateProbabilities();
if (env.verbose === true) {
console.log("Profit: " + env.profit+" WinRate: " +
(env.winRate*100).toFixed(2)+" Bets: "+env.betHistory.length);
}
else{
if(env.betHistory.length>=env.nextLog){
console.log("Profit: " + env.profit+" WinRate: " +
(env.winRate*100).toFixed(2)+" Bets: "+env.betHistory.length);
env.nextLog+=env.showEvery;
}
}
}
else {
console.log("To test this bot or Train a network use free-simulator.js");
env.stop=true;
}
var reward = env.getReward();
if (reward == 0) {
if (outcome === "L") {
reward = -0.03;
} else {
reward = 0.01;
}
}
agent.learn(reward);
env.previousReward=reward;
}
}, env.betSpeed);
//type all this in the console, pressing enter after every line
env.maxBet=0.0001;// set your maximum bet in BTC
env.minBet=0.00000002;//set your minimum bet in BTC cannot be less than 0.00000001
env.showEvery=100;//Log betting info to console after
env.seedEvery=100;//change seed after n bets
env.useMartingale=true;// martingale?
env.martingaleMultiplier=2;//multiplier used to increase the bet after a loss
env.stop=false;//start betting
