Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions macarico/lts/reinforce.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

import macarico

class ReinforceException(Exception):
    """Raised when REINFORCE is given a loss of a type it cannot handle."""


class Reinforce(macarico.Learner):
"REINFORCE with a scalar baseline function."
Expand All @@ -30,9 +33,26 @@ def update(self, loss):
if len(self.trajectory) > 0:
b = self.baseline()
total_loss = 0
for a, p_a in self.trajectory:
total_loss += (loss - b) * dy.log(p_a)
self.baseline.update(loss)

# Loss per time step of episode
if isinstance(loss, np.ndarray):
for idx, (a, p_a) in enumerate(self.trajectory):
total_loss += (loss[idx] - b) * dy.log(p_a)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we're assuming gamma is always 0 (each step's loss is used directly, with no discounting of future steps); can we change this to compute discounted returns for a given gamma?

for time_step_loss in loss:
self.baseline.update(time_step_loss)

# Loss per episode
elif isinstance(loss, (int, long, float)):
for a, p_a in self.trajectory:
total_loss += (loss - b) * dy.log(p_a)

self.baseline.update(loss)

# Error Unknown loss
else:
raise ReinforceException("Unknown loss type")

total_loss.forward()
total_loss.backward()
#torch.autograd.backward(self.trajectory[:], [None]*len(self.trajectory))
Expand Down
138 changes: 138 additions & 0 deletions macarico/tasks/sysadmin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# https://esc.fnwi.uva.nl/thesis/centraal/files/f355157274.pdf
# http://www.jmlr.org/papers/volume6/wingate05a/wingate05a.pdf

from __future__ import division

import random
import numpy as np
import dynet as dy
import macarico
from enum import Enum
import macarico

# Verbose debug-print toggle used throughout this module (1 = on, 0 = off).
# NOTE(review): this shadows the builtin `help`; consider renaming to DEBUG.
help = 1
# Number of machines in the ring topology.
network_size = 3

# modulo operator (ring topology)
class Network(object):
    """Static description of a ring of `network_size` machines.

    Holds the failure-probability parameters and acts as a factory for
    fresh SysAdmin episodes via mk_env().
    """

    def __init__(self):
        # Every machine starts working (status 0 == working, 1 == failing).
        self.default_computer_status = np.array([0] * network_size)

        # sysadmin parameters
        self.small_prob_failure = .075      # chance a working machine fails on its own each round
        self.incr_failing_neighbor = 0.125  # extra failure mass added by a failing neighbour
        self.gamma = 0.95                   # discount factor (NOTE(review): unused by SysAdmin so far)
        self.n_actions = network_size + 1   # reboot one of N machines, or do nothing

        # Per-machine [P(stay working), P(fail)] for the next round.
        self.failing_default = [1 - self.small_prob_failure, self.small_prob_failure]
        self.prob_failure = np.array([self.failing_default] * network_size)

    def mk_env(self):
        """Reset the shared per-machine state and hand out a fresh episode."""
        self.default_computer_status = np.array([0] * network_size)
        self.prob_failure = np.array([self.failing_default] * network_size)
        return SysAdmin(self)

class SysAdmin(macarico.Env):

def __init__(self, network):
self.network = network
self.t = 0
self.reward = 0
self.reward_array = []
self.discount = 1
self.comp_status = network.default_computer_status[:]
self.random_seeds = None

# For macarico.Env
self.T = 20 # Horizon
self.n_actions = network_size + 1
self.actions = range(self.n_actions)

def run_episode(self, policy):
self.random_seeds = np.array([np.random.RandomState(0), np.random.RandomState(10), np.random.RandomState(8)])
self.output = []
for self.t in range(self.T):
if help:
print "\nt: ", str(self.t) , " --> ", np.array_str(self.comp_status)
a = policy(self)
# During each step the agent can do nothing or reboot any of the computers
a, r = self.step(a)
self.output.append(a)
self.reward += self.discount * np.sum(1 * (1-self.comp_status))
self.reward += self.discount * np.sum(-2 * self.comp_status)

this_reward = self.discount * np.sum(-2 * self.comp_status) + \
self.discount * np.sum(1-self.comp_status) + \
r

self.reward_array.append(this_reward)
if help:
print "\t\t Reward --> ", str(self.reward)
if help:
print (" ------------------\n")
print "\t state --> ", np.array_str(self.comp_status)
print "\t Reward --> ", str(self.reward)
print "Done! ----> episode"
print "\t --------------------"
return self.output, self.reward

def step(self, action):
tmp_reward = 0
# computer can start to fail with a small chance
# probability of computer failing randomly .075
fail_chance = []
for idx, (prob_succ, prob_fail) in enumerate(self.network.prob_failure):
fail_chance.append(self.random_seeds[idx].choice([0,1], p=[prob_succ, prob_fail], size=(1))[0])

fail_chance = fail_chance | self.comp_status

# If a computer is connected to a failing com
for idx, val in enumerate(fail_chance):
if val:
if help:
print "\t fail: [", str(idx), "]"
self.comp_status[idx] = 1
for nbr in [(idx+1) % 3, (idx-1) % 3]:
if help:
print "\t\t Neighbor Failure Increase: [", str(nbr), "]"
self.network.prob_failure[nbr][0] -= self.network.incr_failing_neighbor
self.network.prob_failure[nbr][1] += self.network.incr_failing_neighbor
if help:
print "\t\t network.prob_failure: ", str(self.network.prob_failure[nbr])

self.network.prob_failure[nbr][0] = max([self.network.prob_failure[nbr][0], 0])
self.network.prob_failure[nbr][1] = min([self.network.prob_failure[nbr][1], 1])

#Last action is to do nothing
#Else reboot the computer choosen
if action != (self.n_actions-1):
if help:
print "\t fix: [", str(action), "]"
self.comp_status[action] = 0
self.network.prob_failure[action] = self.network.failing_default
if help:
print "\t\t network.prob_failure: ", str(self.network.prob_failure[action])
self.reward += self.discount * -2.0
tmp_reward = self.discount * -2.0

else:
if help:
print "\t fix: None - ", str(action)

return action, tmp_reward

class SysAdminLoss(macarico.Loss):
    """Per-step episode loss: the negated reward trace."""

    def __init__(self):
        super(SysAdminLoss, self).__init__('reward')

    def evaluate(self, ex, state):
        # Higher reward means lower loss; one entry per time step.
        return -np.array(state.reward_array)

class SysAdminFeatures(macarico.Features):
    """Exposes the raw machine up/down status vector as policy features."""

    def __init__(self):
        macarico.Features.__init__(self, 'computers', network_size)

    def forward(self, state):
        # dynet expects a 2-D tensor; present the status vector as one row.
        row = np.reshape(state.comp_status, (1, network_size))
        return dy.inputTensor(row)
52 changes: 52 additions & 0 deletions tests/test_sysadmin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from __future__ import division
import random
import dynet as dy
import numpy as np

import macarico.util
macarico.util.reseed()

from macarico.lts.reinforce import Reinforce
from macarico.annealing import EWMA
from macarico.features.sequence import AttendAt
from macarico.features.actor import TransitionRNN, TransitionBOW
from macarico.policies.linear import LinearPolicy
from test_pocman import run_environment

from macarico.tasks.sysadmin import Network, SysAdmin, SysAdminLoss, SysAdminFeatures

net_size = 3

def run_sysadmin(net, actor):
dy_model = dy.ParameterCollection()
policy = LinearPolicy(dy_model, actor(dy_model), net_size+1)
baseline = EWMA(0.8)
optimizer = dy.AdamTrainer(dy_model, alpha=0.01)
losses = []
for epoch in xrange(3001):
dy.renew_cg()
learner = Reinforce(policy, baseline)
env = net.mk_env()
res,reward = env.run_episode(learner)
loss = SysAdminLoss()(net, env)
losses.append(np.sum(loss))
if epoch % 10 == 0:
print epoch, ' ', sum(losses[-500:]) / len(losses[-500:]), '\t', res, reward
learner.update(loss)
optimizer.update()


def test():
    # Smoke-test: train REINFORCE on the SysAdmin ring using a bag-of-words
    # actor that always attends to position 0 of the 'computers' features.
    print '\n===\n=== \n==='
    net = Network()
    run_sysadmin(
        net,
        lambda dy_model:
        TransitionBOW(dy_model,
                      [SysAdminFeatures()],
                      [AttendAt(lambda _: 0, 'computers')],
                      4)
    )

if __name__ == '__main__':
    test()