# -- Patch 1/2 (Kiante Brantley, 2017-11-12) --
# macarico/lts/reinforce.py: support a reward per time-step (issue 31) in
# addition to the original single scalar episode loss.

import numbers

import numpy as np
import dynet as dy

import macarico


class ReinforceException(Exception):
    """Raised when Reinforce.update receives a loss it cannot interpret."""
    pass


# Method of macarico.lts.reinforce.Reinforce ("REINFORCE with a scalar
# baseline function"); the rest of the class is unchanged by the patch.
def update(self, loss):
    """Backpropagate REINFORCE gradients for the recorded trajectory.

    Parameters
    ----------
    loss : real number or 1-d numpy array
        A scalar episode loss applied to every step of the trajectory, or
        a per-time-step loss vector aligned with ``self.trajectory``.

    Raises
    ------
    ReinforceException
        If ``loss`` is neither a real number nor a numpy array, or if a
        vector loss does not match the trajectory length.
    """
    if len(self.trajectory) > 0:
        b = self.baseline()
        total_loss = 0

        if isinstance(loss, np.ndarray):
            # Loss per time step of the episode.  Fail fast on a length
            # mismatch instead of raising an opaque IndexError below.
            if len(loss) != len(self.trajectory):
                raise ReinforceException(
                    "per-step loss has length %d but trajectory has length %d"
                    % (len(loss), len(self.trajectory)))

            for idx, (a, p_a) in enumerate(self.trajectory):
                total_loss += (loss[idx] - b) * dy.log(p_a)

            # Feed every per-step loss to the baseline estimator.
            for time_step_loss in loss:
                self.baseline.update(time_step_loss)

        # Loss per episode.  numbers.Real covers int/long/float as well as
        # numpy scalar types (e.g. np.float32), which the original
        # isinstance(loss, (int, long, float)) check rejected.
        elif isinstance(loss, numbers.Real):
            for a, p_a in self.trajectory:
                total_loss += (loss - b) * dy.log(p_a)

            self.baseline.update(loss)

        # Error: unknown loss type.
        else:
            raise ReinforceException("Unknown loss type: %r" % type(loss))

        total_loss.forward()
        total_loss.backward()
index 0000000..a911fdf --- /dev/null +++ b/macarico/tasks/sysadmin.py @@ -0,0 +1,138 @@ +# https://esc.fnwi.uva.nl/thesis/centraal/files/f355157274.pdf +# http://www.jmlr.org/papers/volume6/wingate05a/wingate05a.pdf + +from __future__ import division + +import random +import numpy as np +import dynet as dy +import macarico +from enum import Enum +import macarico + +help = 1 +network_size = 3 + +# modulo operator (ring topology) +class Network(object): + def __init__(self): + self.default_computer_status = np.array([0]*network_size) # 3 computers status + + # sysadmin variables + self.small_prob_failure = .075 # prob of any computer changing from working to failing + self.incr_failing_neighbor = 0.125 # failure increase due to connected to failing computer + self.gamma = 0.95 + self.n_actions = network_size + 1 + + # Probability of failing each round + self.failing_default = [1-self.small_prob_failure, self.small_prob_failure] + self.prob_failure = np.array([self.failing_default,]*network_size) + + def mk_env(self): + self.default_computer_status = np.array([0] *network_size) # 3 computers status + self.prob_failure = np.array([self.failing_default,]*network_size) + return SysAdmin(self) + +class SysAdmin(macarico.Env): + + def __init__(self, network): + self.network = network + self.t = 0 + self.reward = 0 + self.reward_array = [] + self.discount = 1 + self.comp_status = network.default_computer_status[:] + self.random_seeds = None + + # For macarico.Env + self.T = 20 # Horizon + self.n_actions = network_size + 1 + self.actions = range(self.n_actions) + + def run_episode(self, policy): + self.random_seeds = np.array([np.random.RandomState(0), np.random.RandomState(10), np.random.RandomState(8)]) + self.output = [] + for self.t in range(self.T): + if help: + print "\nt: ", str(self.t) , " --> ", np.array_str(self.comp_status) + a = policy(self) + # During each step the agent can do nothing or reboot any of the computers + a, r = self.step(a) + self.output.append(a) 
+ self.reward += self.discount * np.sum(1 * (1-self.comp_status)) + self.reward += self.discount * np.sum(-2 * self.comp_status) + + this_reward = self.discount * np.sum(-2 * self.comp_status) + \ + self.discount * np.sum(1-self.comp_status) + \ + r + + self.reward_array.append(this_reward) + if help: + print "\t\t Reward --> ", str(self.reward) + if help: + print (" ------------------\n") + print "\t state --> ", np.array_str(self.comp_status) + print "\t Reward --> ", str(self.reward) + print "Done! ----> episode" + print "\t --------------------" + return self.output, self.reward + + def step(self, action): + tmp_reward = 0 + # computer can start to fail with a small chance + # probability of computer failing randomly .075 + fail_chance = [] + for idx, (prob_succ, prob_fail) in enumerate(self.network.prob_failure): + fail_chance.append(self.random_seeds[idx].choice([0,1], p=[prob_succ, prob_fail], size=(1))[0]) + + fail_chance = fail_chance | self.comp_status + + # If a computer is connected to a failing com + for idx, val in enumerate(fail_chance): + if val: + if help: + print "\t fail: [", str(idx), "]" + self.comp_status[idx] = 1 + for nbr in [(idx+1) % 3, (idx-1) % 3]: + if help: + print "\t\t Neighbor Failure Increase: [", str(nbr), "]" + self.network.prob_failure[nbr][0] -= self.network.incr_failing_neighbor + self.network.prob_failure[nbr][1] += self.network.incr_failing_neighbor + if help: + print "\t\t network.prob_failure: ", str(self.network.prob_failure[nbr]) + + self.network.prob_failure[nbr][0] = max([self.network.prob_failure[nbr][0], 0]) + self.network.prob_failure[nbr][1] = min([self.network.prob_failure[nbr][1], 1]) + + #Last action is to do nothing + #Else reboot the computer choosen + if action != (self.n_actions-1): + if help: + print "\t fix: [", str(action), "]" + self.comp_status[action] = 0 + self.network.prob_failure[action] = self.network.failing_default + if help: + print "\t\t network.prob_failure: ", 
str(self.network.prob_failure[action]) + self.reward += self.discount * -2.0 + tmp_reward = self.discount * -2.0 + + else: + if help: + print "\t fix: None - ", str(action) + + return action, tmp_reward + +class SysAdminLoss(macarico.Loss): + def __init__(self): + super(SysAdminLoss, self).__init__('reward') + + def evaluate(self, ex, state): + return (-1) * np.array(state.reward_array) + +class SysAdminFeatures(macarico.Features): + def __init__(self): + macarico.Features.__init__(self, 'computers', network_size) + + def forward(self, state): + view = np.reshape(state.comp_status, (1,network_size)) + return dy.inputTensor(view) diff --git a/tests/test_sysadmin.py b/tests/test_sysadmin.py new file mode 100644 index 0000000..0962cfd --- /dev/null +++ b/tests/test_sysadmin.py @@ -0,0 +1,52 @@ +from __future__ import division +import random +import dynet as dy +import numpy as np + +import macarico.util +macarico.util.reseed() + +from macarico.lts.reinforce import Reinforce +from macarico.annealing import EWMA +from macarico.features.sequence import AttendAt +from macarico.features.actor import TransitionRNN, TransitionBOW +from macarico.policies.linear import LinearPolicy +from test_pocman import run_environment + +from macarico.tasks.sysadmin import Network, SysAdmin, SysAdminLoss, SysAdminFeatures + +net_size = 3 + +def run_sysadmin(net, actor): + dy_model = dy.ParameterCollection() + policy = LinearPolicy(dy_model, actor(dy_model), net_size+1) + baseline = EWMA(0.8) + optimizer = dy.AdamTrainer(dy_model, alpha=0.01) + losses = [] + for epoch in xrange(3001): + dy.renew_cg() + learner = Reinforce(policy, baseline) + env = net.mk_env() + res,reward = env.run_episode(learner) + loss = SysAdminLoss()(net, env) + losses.append(np.sum(loss)) + if epoch % 10 == 0: + print epoch, ' ', sum(losses[-500:]) / len(losses[-500:]), '\t', res, reward + learner.update(loss) + optimizer.update() + + +def test(): + print '\n===\n=== \n===' + net = Network() + run_sysadmin( + net, + 
lambda dy_model: + TransitionBOW(dy_model, + [SysAdminFeatures()], + [AttendAt(lambda _: 0, 'computers')], + 4) + ) + +if __name__ == '__main__': + test()