From 00da8635141c50fe025abc824d1d3b13d71d894f Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Fri, 16 Aug 2019 11:57:37 -0700 Subject: [PATCH 1/8] added ppo for pytorch --- trainers/configs/1-Food.yaml | 22 +++ trainers/configs/2-Preferences.yaml | 11 ++ trainers/configs/3-Obstacles.yaml | 17 ++ trainers/configs/4-Avoidance.yaml | 11 ++ trainers/configs/5-SpatialReasoning.yaml | 22 +++ trainers/configs/6-Generalization.yaml | 22 +++ trainers/configs/7-InternalMemory.yaml | 24 +++ trainers/configs/allObjectsRandom.yaml | 150 ++++++++++++++++ trainers/configs/exampleConfig.yaml | 23 +++ trainers/configs/exampleTraining.yaml | 34 ++++ trainers/configs/forcedChoice.yaml | 43 +++++ trainers/configs/internalMemory.yaml | 32 ++++ trainers/configs/movingFood.yaml | 9 + trainers/configs/objectManipulation.yaml | 17 ++ trainers/configs/rainbow.gin | 34 ++++ trainers/configs/trainer_config.yaml | 26 +++ trainers/configs/trainer_config2.yaml | 27 +++ trainers/ppo.py | 215 +++++++++++++++++++++++ 18 files changed, 739 insertions(+) create mode 100644 trainers/configs/1-Food.yaml create mode 100644 trainers/configs/2-Preferences.yaml create mode 100644 trainers/configs/3-Obstacles.yaml create mode 100644 trainers/configs/4-Avoidance.yaml create mode 100644 trainers/configs/5-SpatialReasoning.yaml create mode 100644 trainers/configs/6-Generalization.yaml create mode 100644 trainers/configs/7-InternalMemory.yaml create mode 100644 trainers/configs/allObjectsRandom.yaml create mode 100644 trainers/configs/exampleConfig.yaml create mode 100644 trainers/configs/exampleTraining.yaml create mode 100644 trainers/configs/forcedChoice.yaml create mode 100644 trainers/configs/internalMemory.yaml create mode 100644 trainers/configs/movingFood.yaml create mode 100644 trainers/configs/objectManipulation.yaml create mode 100644 trainers/configs/rainbow.gin create mode 100644 trainers/configs/trainer_config.yaml create mode 100644 trainers/configs/trainer_config2.yaml create mode 100644 trainers/ppo.py diff --git a/trainers/configs/1-Food.yaml b/trainers/configs/1-Food.yaml new file mode 100644 index 00000000..c5992f16 --- /dev/null +++ b/trainers/configs/1-Food.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 1: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 2: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 3: !Arena + t: 250 + items: + - !Item + name: GoodGoal diff --git a/trainers/configs/2-Preferences.yaml b/trainers/configs/2-Preferences.yaml new file mode 100644 index 00000000..2ea5a4cc --- /dev/null +++ b/trainers/configs/2-Preferences.yaml @@ -0,0 +1,11 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 250 + items: + - !Item + name: GoodGoal + - !Item + name: GoodGoalMulti + - !Item + name: BadGoal diff --git a/trainers/configs/3-Obstacles.yaml b/trainers/configs/3-Obstacles.yaml new file mode 100644 index 00000000..845e977c --- /dev/null +++ b/trainers/configs/3-Obstacles.yaml @@ -0,0 +1,17 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoal + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} diff --git a/trainers/configs/4-Avoidance.yaml b/trainers/configs/4-Avoidance.yaml new file mode 100644 index 00000000..f0c64551 --- /dev/null +++ b/trainers/configs/4-Avoidance.yaml @@ -0,0 +1,11 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoal + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: DeathZone diff --git a/trainers/configs/5-SpatialReasoning.yaml b/trainers/configs/5-SpatialReasoning.yaml new file mode 100644 index 00000000..d0e7c9ea --- /dev/null +++ b/trainers/configs/5-SpatialReasoning.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalMulti + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Ramp + colors: + - !RGB {r: 255, g: 0, b: 255} + - !RGB {r: 255, g: 0, b: 255} + - !RGB {r: 255, g: 0, b: 255} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !RGB {r: 153, g: 153, b: 153} + - !RGB {r: 153, g: 153, b: 153} diff --git a/trainers/configs/6-Generalization.yaml b/trainers/configs/6-Generalization.yaml new file mode 100644 index 00000000..942058a0 --- /dev/null +++ b/trainers/configs/6-Generalization.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalMulti + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Ramp + - !Item + name: Ramp + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: Wall + - !Item + name: Wall \ No newline at end of file diff --git a/trainers/configs/7-InternalMemory.yaml b/trainers/configs/7-InternalMemory.yaml new file mode 100644 index 00000000..c70c2a84 --- /dev/null +++ b/trainers/configs/7-InternalMemory.yaml @@ -0,0 +1,24 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + blackouts: [-20] + items: + - !Item + name: GoodGoal + 1: !Arena + t: 100 + blackouts: [-40] + items: + - !Item + name: Wall + - !Item + name: GoodGoal + 2: !Arena + t: 100 + blackouts: [50, 55, 75, 80, 100, 105, 125] + items: + - !Item + name: WallTranparent + - !Item + name: GoodGoal diff --git a/trainers/configs/allObjectsRandom.yaml b/trainers/configs/allObjectsRandom.yaml new file mode 100644 index 00000000..aaedf9c1 --- /dev/null +++ b/trainers/configs/allObjectsRandom.yaml @@ -0,0 +1,150 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 1: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 2: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 3: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone diff --git a/trainers/configs/exampleConfig.yaml b/trainers/configs/exampleConfig.yaml new file mode 100644 index 00000000..ce7e39a3 --- /dev/null +++ b/trainers/configs/exampleConfig.yaml @@ -0,0 +1,23 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: Cube + positions: + - !Vector3 {x: 10, y: 0, z: 10} + - !Vector3 {x: -1, y: 0, z: 30} + colors: + - !RGB {r: 204, g: 0, b: 204 } + rotations: [45] + sizes: + - !Vector3 {x: -1, y: 5, z: -1} + - !Item + name: Cylinder + colors: + - !RGB {r: 204, g: 0, b: 204 } + - !RGB {r: 204, g: 0, b: 204 } + - !RGB {r: 204, g: 0, b: 204 } + - !Item + name: GoodGoal \ No newline at end of file diff --git a/trainers/configs/exampleTraining.yaml b/trainers/configs/exampleTraining.yaml new file mode 100644 index 00000000..e4a952ad --- /dev/null +++ b/trainers/configs/exampleTraining.yaml @@ -0,0 +1,34 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 1000 + items: + - !Item + name: Wall + - !Item + name: CylinderTunnel + - !Item + name: GoodGoal + 1: !Arena + t: 1000 + items: + - !Item + name: TransparentWall + - !Item + name: GoodGoal + 2: !Arena + t: 1000 + items: + - !Item + name: Cardbox1 + - !Item + name: BadGoal + - !Item + name: GoodGoal + 3: !Arena + t: 1000 + items: + - !Item + name: DeathZone + - !Item + name: GoodGoal \ No newline at end of file diff --git a/trainers/configs/forcedChoice.yaml b/trainers/configs/forcedChoice.yaml new file mode 100644 index 00000000..245a235e --- /dev/null +++ b/trainers/configs/forcedChoice.yaml @@ -0,0 +1,43 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: Wall + positions: + - !Vector3 {x: 20, y: 0, z: 20} + - !Vector3 {x: 20, y: 0, z: 8.75} + - !Vector3 {x: 20, y: 0, z: 31.25} + - !Vector3 {x: 8.75, y: 0, z: 20} + - !Vector3 {x: 31.25, y: 0, z: 20} + rotations: [0,0,0,0,0] + sizes: + - !Vector3 {x: 5, y: 0.5, z: 5} + - !Vector3 {x: .1, y: 5, z: 17.5} + - !Vector3 {x: .1, y: 5, z: 17.5} + - !Vector3 {x: 17.5, y: 5, z: .1} + - !Vector3 {x: 17.5, y: 5, z: .1} + colors: + - !RGB {r: 0, g: 0, b: 255} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !Item + name: Agent + positions: + - !Vector3 {x: 20, y: .5, z: 20} + rotations: [0] + - !Item + name: GoodGoal + positions: + - !Vector3 {x: 30, y: 0, z: 30} + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: BadGoal + positions: + - !Vector3 {x: 10, y: 0, z: 30} + sizes: + - !Vector3 {x: 1, y: 1, z: 1} \ No newline at end of file diff --git a/trainers/configs/internalMemory.yaml b/trainers/configs/internalMemory.yaml new file mode 100644 index 00000000..35007091 --- /dev/null +++ b/trainers/configs/internalMemory.yaml @@ -0,0 +1,32 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + blackouts: [-20] + items: + - !Item + name: GoodGoal + 1: !Arena + t: 100 + blackouts: [-40] + items: + - !Item + name: Wall + - !Item + name: GoodGoal + 2: !Arena + t: 100 + blackouts: [50, 55, 75, 80, 100, 105, 125] + items: + - !Item + name: WallTranparent + - !Item + name: GoodGoal + 3: !Arena + t: 100 + blackouts: [25, 30, 50, 55, 75] + items: + - !Item + name: CylinderTunnel + - !Item + name: GoodGoal diff --git a/trainers/configs/movingFood.yaml b/trainers/configs/movingFood.yaml new file mode 100644 index 00000000..6ebccd81 --- /dev/null +++ b/trainers/configs/movingFood.yaml @@ -0,0 +1,9 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalBounce + - !Item + name: BadGoalBounce \ No newline at end of file diff --git a/trainers/configs/objectManipulation.yaml b/trainers/configs/objectManipulation.yaml new file mode 100644 index 00000000..8b957bd4 --- /dev/null +++ b/trainers/configs/objectManipulation.yaml @@ -0,0 +1,17 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoal + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: UObject + - !Item + name: LObject diff --git a/trainers/configs/rainbow.gin b/trainers/configs/rainbow.gin new file mode 100644 index 00000000..1cc5e979 --- /dev/null +++ b/trainers/configs/rainbow.gin @@ -0,0 +1,34 @@ +# Hyperparameters follow Hessel et al. (2018). +import dopamine.agents.rainbow.rainbow_agent +import animalai_train.dopamine.animalai_lib +import dopamine.discrete_domains.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() +RainbowAgent.network = @animalai_lib.rainbow_network + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +create_agent.agent_name = 'rainbow' +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/trainers/configs/trainer_config.yaml b/trainers/configs/trainer_config.yaml new file mode 100644 index 00000000..314ca567 --- /dev/null +++ b/trainers/configs/trainer_config.yaml @@ -0,0 +1,26 @@ +default: + trainer: ppo + +Learner: + trainer: ppo + epsilon: 0.2 + gamma: 0.99 + lambd: 0.95 + learning_rate: 3.0e-4 + memory_size: 256 + normalize: false + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + use_curiosity: true + curiosity_strength: 0.01 + curiosity_enc_size: 256 + time_horizon: 128 + batch_size: 64 + buffer_size: 2024 + hidden_units: 256 + num_layers: 1 + beta: 1.0e-2 + max_steps: 5.0e6 + num_epoch: 3 + diff --git a/trainers/configs/trainer_config2.yaml b/trainers/configs/trainer_config2.yaml new file mode 100644 index 00000000..b9a045f6 --- /dev/null +++ b/trainers/configs/trainer_config2.yaml @@ -0,0 +1,27 @@ +default: + trainer: ppo + model_path: /data1/AnimalAI-Olympics/examples/2daytrain.cptk + +Learner: + trainer: ppo + epsilon: 0.2 + gamma: 0.99 + lambd: 0.95 + learning_rate: 3.0e-4 + memory_size: 256 + normalize: false + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + use_curiosity: true + curiosity_strength: 0.01 + curiosity_enc_size: 256 + time_horizon: 128 + batch_size: 64 + buffer_size: 2024 + hidden_units: 256 + num_layers: 1 + beta: 1.0e-2 + max_steps: 5.0e6 + num_epoch: 3 + diff --git a/trainers/ppo.py b/trainers/ppo.py new file mode 100644 index 00000000..2ed68d18 --- /dev/null +++ b/trainers/ppo.py @@ -0,0 +1,215 @@ +import gym +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.distributions import Categorical +import numpy as np + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + + +# my params +env_path = '../env/AnimalAI' +brain_name='Learner' +train_mode=True +num_actions = 9 +color_channels = 3 +env_field = 'configs/1-Food.yaml' +n_episodes = 2000 +#max_t = 100 +actions_array = np.array([[0,0],[0,1],[0,2],[1,0], [1,1],[1,2], [2,0],[2,1],[2,2]]) +n_arenas = 4 + +cuda = torch.device('cuda') + +#Hyperparameters +learning_rate = 0.0005 +gamma = 0.98 +lmbda = 0.95 +eps_clip = 0.1 +K_epoch = 2 +T_horizon = 500 + + + +class PPO(nn.Module): + def __init__(self): + super(PPO, self).__init__() + self.data = [] + + self.conv1 = nn.Conv2d(color_channels, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc4 = nn.Linear(7 * 7 * 64, 512) + self.fc_pi = nn.Linear(512, num_actions) + self.fc_v = nn.Linear(512, 1) + + #self.fc1 = nn.Linear(4,256) + #self.fc_pi = nn.Linear(256,2) + #self.fc_v = nn.Linear(256,1) + self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) + + def pi(self, x, softmax_dim = 1): + #x = x.permute(2,0,1) + if x.ndim == 3: + x = x.unsqueeze(0) + #x = x.transpose(1,3) + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.fc4(x.view(x.size(0), -1))) + x = self.fc_pi(x) + + #x = F.relu(self.fc1(x)) + #x = self.fc_pi(x) + + prob = F.softmax(x, dim=softmax_dim) + + + return prob + + def v(self, x): + + #x = x.transpose(1,3) + #print(x.shape) + #x = x.permute(2,0,1) + #x = x.unsqueeze(0) + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.fc4(x.view(x.size(0), -1))) + + #x = F.relu(self.fc1(x)) + v = self.fc_v(x) + return v + + def put_data(self, transition): + + self.data.append(transition) + + def make_batch(self): + s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] + for transition in self.data: + s, a, r, s_prime, prob_a, done = transition + + + s_lst.append(s) + a_lst.append([a]) + r_lst.append([r]) + s_prime_lst.append(s_prime) + prob_a_lst.append([prob_a]) + done_mask = 0 if done else 1 + done_lst.append([done_mask]) + + s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ + torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ + torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) + + prob_a = prob_a.cuda() + a = a.cuda() + s_prime = s_prime.cuda() + r = r.cuda() + done_mask = done_mask.cuda() + s = s.cuda() + + self.data = [] + return s, a, r, s_prime, done_mask, prob_a + + def train_net(self): + s, a, r, s_prime, done_mask, prob_a = self.make_batch() + + + for i in range(K_epoch): + td_target = r + gamma * self.v(s_prime) * done_mask + delta = td_target - self.v(s) + delta = delta.detach().cpu().numpy() + + advantage_lst = [] + advantage = 0.0 + for delta_t in delta[::-1]: + advantage = gamma * lmbda * advantage + delta_t[0] + advantage_lst.append([advantage]) + advantage_lst.reverse() + advantage = torch.tensor(advantage_lst, dtype=torch.float).cuda() + + pi = self.pi(s, softmax_dim=1) + + + pi_a = pi.gather(1,a) + ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) + + surr1 = ratio * advantage + surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage + loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach()) + + self.optimizer.zero_grad() + loss.mean().backward() + self.optimizer.step() + +def main(): + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100)) + arena_config_in = ArenaConfig(env_field) + print(arena_config_in.arenas) + + + model = PPO() + + model = model.cuda() + + print_interval = 1 + + for n_epi in range(1, n_episodes+1): + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) + state = action_info[brain_name].visual_observations[0] + + #state = np.moveaxis(state, -1, 0) + state = np.moveaxis(state, -1, 1) + done = False + score = 0.0 + scores = [] + + while not done: + for t in range(T_horizon): + + prob = model.pi(torch.from_numpy(state).float().cuda()) + m = Categorical(prob) + + #a = m.sample().item() + a = m.sample() + action = actions_array[a.cpu().numpy().astype(int)] + #s_prime, reward, done, info = + action_info = env.step(vector_action=action) + next_state = action_info[brain_name].visual_observations[0] + #next_state = np.moveaxis(next_state, -1, 0) + next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + prob_a = prob[np.arange(prob.shape[0])[:,None], a.cpu().numpy().astype(int)[:,None]] + + for (s, a, r, n_s, p_a, d) in zip (state, a, reward, next_state, prob_a, arenas_done): + model.put_data((s, a, r, n_s, p_a, d)) + scores.append(r) + #model.put_data((state, a, reward, next_state, prob[0][a].item(), done)) + #model.put_data((state, a, reward, next_state, prob_a, done)) + state = next_state + + #score += reward + if done: + break + + + model.train_net() + + #scores.append(score) + + if n_epi%print_interval==0 and n_epi!=0: + print("Episode: {}, avg score: {:.1f}".format(n_epi, np.mean(scores)/n_arenas)) + + env.close() + +if __name__ == '__main__': + main() From ce19dcb32482f64e9369276094e9c70104c08154 Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Sat, 17 Aug 2019 00:40:25 -0700 Subject: [PATCH 2/8] update ppo --- trainers/ppo.py | 150 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 129 insertions(+), 21 deletions(-) diff --git a/trainers/ppo.py b/trainers/ppo.py index 2ed68d18..f392de45 100644 --- a/trainers/ppo.py +++ b/trainers/ppo.py @@ -5,31 +5,44 @@ import torch.optim as optim from torch.distributions import Categorical import numpy as np +import time +import argparse +import os from animalai.envs import UnityEnvironment from animalai.envs.arena_config import ArenaConfig +parser = argparse.ArgumentParser(description="Train ppo agent for AnimalAI.") +parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file') +parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load') +parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode') + +args = parser.parse_args() + # my params env_path = '../env/AnimalAI' brain_name='Learner' train_mode=True num_actions = 9 color_channels = 3 -env_field = 'configs/1-Food.yaml' -n_episodes = 2000 +env_field = args.config +n_episodes = 20000 #max_t = 100 actions_array = np.array([[0,0],[0,1],[0,2],[1,0], [1,1],[1,2], [2,0],[2,1],[2,2]]) -n_arenas = 4 +n_arenas = 3 +print_interval = 1 +save_interval = 10 +save_path = 'saved_models/' -cuda = torch.device('cuda') +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") #Hyperparameters learning_rate = 0.0005 gamma = 0.98 lmbda = 0.95 eps_clip = 0.1 -K_epoch = 2 +K_epoch = 4 T_horizon = 500 @@ -107,12 +120,12 @@ def make_batch(self): torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) - prob_a = prob_a.cuda() - a = a.cuda() - s_prime = s_prime.cuda() - r = r.cuda() - done_mask = done_mask.cuda() - s = s.cuda() + prob_a = prob_a.to(device) + a = a.to(device) + s_prime = s_prime.to(device) + r = r.to(device) + done_mask = done_mask.to(device) + s = s.to(device) self.data = [] return s, a, r, s_prime, done_mask, prob_a @@ -132,7 +145,7 @@ def train_net(self): advantage = gamma * lmbda * advantage + delta_t[0] advantage_lst.append([advantage]) advantage_lst.reverse() - advantage = torch.tensor(advantage_lst, dtype=torch.float).cuda() + advantage = torch.tensor(advantage_lst, dtype=torch.float).to(device) pi = self.pi(s, softmax_dim=1) @@ -148,17 +161,21 @@ def train_net(self): loss.mean().backward() self.optimizer.step() -def main(): - env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100)) +def train(): + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) arena_config_in = ArenaConfig(env_field) - print(arena_config_in.arenas) + #print(arena_config_in.arenas) model = PPO() + if os.path.exists(args.load_model): + model.load_state_dict(torch.load(args.load_model)) + print("Successfully loaded saved model from {}".format(args.load_model)) - model = model.cuda() + model = model.to(device) - print_interval = 1 + + total_obs = 0 for n_epi in range(1, n_episodes+1): action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) @@ -170,10 +187,13 @@ def main(): score = 0.0 scores = [] + start_episode = time.time() + n_obs = 0 while not done: for t in range(T_horizon): + n_obs += n_arenas - prob = model.pi(torch.from_numpy(state).float().cuda()) + prob = model.pi(torch.from_numpy(state).float().to(device)) m = Categorical(prob) #a = m.sample().item() @@ -201,15 +221,103 @@ def main(): if done: break - + start_train = time.time() model.train_net() + end_train = time.time() + #print('time to train: ',end_train - start_train) + + end_episode = time.time() + + #print('{} observations/second'.format(n_obs/(end_episode - start_episode))) #scores.append(score) if n_epi%print_interval==0 and n_epi!=0: - print("Episode: {}, avg score: {:.1f}".format(n_epi, np.mean(scores)/n_arenas)) + print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, np.mean(scores)/n_arenas, n_obs/(end_episode - start_episode))) + + if n_epi%save_interval==0 and n_epi!=0: + torch.save(model.state_dict(), save_path+'ppo.pth') + env.close() +def inference(): + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) + arena_config_in = ArenaConfig(env_field) + #print(arena_config_in.arenas) + + + model = PPO() + if os.path.exists(args.load_model): + model.load_state_dict(torch.load(args.load_model)) + print("Successfully loaded saved model from {}".format(args.load_model)) + + model = model.to(device) + + + total_obs = 0 + + for n_epi in range(1, n_episodes+1): + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) + state = action_info[brain_name].visual_observations[0] + + #state = np.moveaxis(state, -1, 0) + state = np.moveaxis(state, -1, 1) + done = False + score = 0.0 + + start_episode = time.time() + n_obs = 0 + while not done: + for t in range(T_horizon): + n_obs += n_arenas + + prob = model.pi(torch.from_numpy(state).float().to(device)) + m = Categorical(prob) + + #a = m.sample().item() + a = m.sample() + action = actions_array[a.cpu().numpy().astype(int)] + #s_prime, reward, done, info = + action_info = env.step(vector_action=action) + next_state = action_info[brain_name].visual_observations[0] + next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + #prob_a = prob[np.arange(prob.shape[0])[:,None], a.cpu().numpy().astype(int)[:,None]] + + #for (s, a, r, n_s, p_a, d) in zip (state, a, reward, next_state, prob_a, arenas_done): + # model.put_data((s, a, r, n_s, p_a, d)) + # scores.append(r) + + state = next_state + + score += reward[0] + if done: + break + + #start_train = time.time() + #model.train_net() + #end_train = time.time() + #print('time to train: ',end_train - start_train) + + end_episode = time.time() + + #print('{} observations/second'.format(n_obs/(end_episode - start_episode))) + + #scores.append(score) + + if n_epi%print_interval==0 and n_epi!=0: + print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, score/n_obs, n_obs/(end_episode - start_episode))) + + + + env.close() if __name__ == '__main__': - main() + + if not args.inference: + train() + else: + inference() From 76102f6624c7d9a52dc6c2e6360671e1dd4210cc Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Sat, 17 Aug 2019 13:28:10 -0700 Subject: [PATCH 3/8] Added to ppo.py and added more arenas to 3-Obstacles.yaml --- trainers/configs/3-Obstacles.yaml | 30 +++++++++++++++++++++++++++--- trainers/ppo.py | 19 +++++++++++++++---- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/trainers/configs/3-Obstacles.yaml b/trainers/configs/3-Obstacles.yaml index 845e977c..a9f686ad 100644 --- a/trainers/configs/3-Obstacles.yaml +++ b/trainers/configs/3-Obstacles.yaml @@ -1,12 +1,36 @@ !ArenaConfig arenas: 0: !Arena - t: 0 + t: 500 + items: + - !Item + name: GoodGoal + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + 1: !Arena + t: 500 + items: + - !Item + name: GoodGoal + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + 2: !Arena + t: 500 items: - !Item name: GoodGoal - sizes: - - !Vector3 {x: 1, y: 1, z: 1} - !Item name: Wall colors: diff --git a/trainers/ppo.py b/trainers/ppo.py index f392de45..927f7e23 100644 --- a/trainers/ppo.py +++ b/trainers/ppo.py @@ -8,18 +8,26 @@ import time import argparse import os +import datetime from animalai.envs import UnityEnvironment from animalai.envs.arena_config import ArenaConfig parser = argparse.ArgumentParser(description="Train ppo agent for AnimalAI.") -parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file') -parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load') -parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode') +parser.add_argument('--train_name', type=str, help='Will save model with this name. Default: random') +parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file. Default: "configs/1-Food.yaml"') +parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load. Default: "saved_models/ppo.pth"') +parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode. Default: False') args = parser.parse_args() + +if not args.inference: + if args.train_name is not None: + train_filename = '{}.pth'.format(args.train_name) + else: + train_filename = 'ppo_{}.pth'.format(np.random.randint(100000,999999)) # my params env_path = '../env/AnimalAI' brain_name='Learner' @@ -236,7 +244,8 @@ def train(): print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, np.mean(scores)/n_arenas, n_obs/(end_episode - start_episode))) if n_epi%save_interval==0 and n_epi!=0: - torch.save(model.state_dict(), save_path+'ppo.pth') + print("Saving model to {}ppo.pth at {}".format(save_path, datetime.datetime.now())) + torch.save(model.state_dict(), save_path+train_filename) env.close() @@ -318,6 +327,8 @@ def inference(): if __name__ == '__main__': if not args.inference: + print("Starting agent in train mode...") train() else: + print("Starting agent in inference mode...") inference() From 78457553eaaf265ec5e5788c8568722ab685b2c0 Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Sat, 17 Aug 2019 15:45:42 -0700 Subject: [PATCH 4/8] Added requirements. --- requirements.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..34ccaf4d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +absl-py==0.7.1 +animalai==1.0.3 +animalai-train==1.0.3 +astor==0.8.0 +atari-py==0.2.6 +cloudpickle==1.2.1 +cycler==0.10.0 +dopamine-rl==2.0.5 +future==0.17.1 +gast==0.2.2 +gin-config==0.2.0 +grpcio==1.11.1 +gym==0.13.1 +h5py==2.9.0 +jsonpickle==1.2 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.0 +kiwisolver==1.1.0 +Markdown==3.1.1 +matplotlib==3.1.1 +numpy==1.14.5 +opencv-python==4.1.0.25 +Pillow==5.4.1 +protobuf==3.6.1 +pyglet==1.3.2 +pyparsing==2.4.0 +python-dateutil==2.8.0 +PyYAML==5.1.1 +scipy==1.3.0 +six==1.12.0 +tensorboard==1.12.2 +tensorflow==1.12.2 +termcolor==1.1.0 +torch==1.2.0 +Werkzeug==0.15.5 From dec3c50f01b7b70f2e662b7ec4addace157f99f9 Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Fri, 23 Aug 2019 00:03:09 +0000 Subject: [PATCH 5/8] Added some code to generate env configs in Python so I can track the positions of agents and items. --- trainers/env_utils.py | 116 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 trainers/env_utils.py diff --git a/trainers/env_utils.py b/trainers/env_utils.py new file mode 100644 index 00000000..f56a4d4b --- /dev/null +++ b/trainers/env_utils.py @@ -0,0 +1,116 @@ +from animalai.envs.arena_config import Vector3, RGB, Item, Arena, ArenaConfig +from collections import defaultdict +import numpy as np +import pprint + + +pp = pprint.PrettyPrinter(indent=4) + + + +class better_env(): + + def __init__(self, n_arenas=3): + + self.n_arenas = n_arenas + #self.details = {} + self.env_config = self.create_env(n_arenas=n_arenas) + self.details = self.get_details() + + + def create_env(self, n_arenas=3): + + include_items = {'Agent':1, 'GoodGoal':1, 'Wall':2} + + + env_config = ArenaConfig() + + # Loop over arenas + for i in range(n_arenas): + env_config.arenas[i] = Arena() + + #self.details[i] = {} + + + item_list = [] + # Loop over item types in each arena + for item_type, item_count in include_items.items(): + + #self.details[i][item_type] = defaultdict(list) + + name = item_type + colors = [] + positions = [] + + # Loop over item counts + for j in range(item_count): + if item_type == 'Wall': + colors.append(RGB(r=153, g=153, b=153)) + #self.details[i][item_type]['colors'].append((153,153,153)) + + + elif item_type == 'GoodGoal': + x = np.random.randint(1,39) + y = np.random.randint(1,39) + z = np.random.randint(1,39) + #self.details[i][item_type]['positions'].append((x,y,z)) + + positions.append(Vector3(x=x, y=y, z=z)) + + elif item_type == 'Agent': + x = np.random.randint(1,39) + y = np.random.randint(1,39) + z = np.random.randint(1,39) + #self.details[i][item_type]['positions'].append((x,y,z)) + + positions.append(Vector3(x=x, y=y, z=z)) + + item_list.append(Item(name=name, positions=positions, colors=colors)) + env_config.arenas[i].items = item_list + + return env_config + + def get_details(self): + + details = {} + + for i, arena in self.env_config.arenas.items(): + details[i] = {} + + for j, item in enumerate(arena.items): + details[i][item.name] = {} + details[i][item.name]['positions'] = [] + details[i][item.name]['rotations'] = [] + details[i][item.name]['sizes'] = [] + details[i][item.name]['colors'] = [] + + for position in item.positions: + details[i][item.name]['positions'].append((position.x, position.y, position.z)) + for rotation in item.rotations: + details[i][item.name]['rotations'].append(rotation) + for size in item.sizes: + details[i][item.name]['sizes'].append((size.x, size.y, size.z)) + for color in item.colors: + details[i][item.name]['colors'].append((color.r, color.g, color.b)) + + return details + + + +def env_info(env_config): + + for i, arena in env_config.arenas.items(): + print("Arena Config #{}".format(i)) + print("max time steps = {}".format(arena.t)) + for j, item in enumerate(arena.items): + print("{:4s}Item name: {}".format('',item.name)) + print("{:8s}Item positions: {}".format('',item.positions)) + print("{:8s}Item rotations: {}".format('',item.rotations)) + print("{:8s}Item sizes: {}".format('',item.sizes)) + print("{:8s}Item colors: {}".format('',item.colors)) + +env = better_env() +env_config = env.env_config +env_info(env_config) +pp.pprint(env.details) +#pp.pprint(env.details2) From 37a4f480827088602485a583bbd16068dcfc80ad Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Fri, 23 Aug 2019 18:58:50 +0000 Subject: [PATCH 6/8] Track position --- trainers/env_utils.py | 40 ++++++++++++++++++++++++++++ trainers/ppo.py | 62 ++++++++++++++++++++++++++++++------------- 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/trainers/env_utils.py b/trainers/env_utils.py index f56a4d4b..3bd68c0b 100644 --- a/trainers/env_utils.py +++ b/trainers/env_utils.py @@ -8,6 +8,25 @@ +class position_tracker(): + + def __init__(self, starting_positions): + + self.agent_start = starting_positions['Agent'] + self.good_goal_start = starting_positions['GoodGoal'] + + self.current_position = np.array(self.agent_start).astype('float64') + + + def position_step(self, velocity_obs): + + velocity_obs = np.array(velocity_obs) + delta_distance = 0.0595 * velocity_obs + + self.current_position += delta_distance + + + class better_env(): def __init__(self, n_arenas=3): @@ -95,6 +114,21 @@ def get_details(self): return details + def get_start_positions(self): + + start_positions = {'Agent': [], 'GoodGoal': []} + + for arena_idx, arena in self.env_config.arenas.items(): + + for item_idx, item in enumerate(arena.items): + if item.name == 'Agent' or item.name == 'GoodGoal': + for position in item.positions: + start_positions[item.name].append([position.x, position.y, position.z]) + + return start_positions + + + def env_info(env_config): @@ -114,3 +148,9 @@ def env_info(env_config): env_info(env_config) pp.pprint(env.details) #pp.pprint(env.details2) + +pp.pprint(env.get_start_positions()) + + +ps = position_tracker(env.get_start_positions()) +print(ps.current_position) diff --git a/trainers/ppo.py b/trainers/ppo.py index 927f7e23..5ee2ad6b 100644 --- a/trainers/ppo.py +++ b/trainers/ppo.py @@ -14,6 +14,8 @@ from animalai.envs.arena_config import ArenaConfig +from env_utils import * + parser = argparse.ArgumentParser(description="Train ppo agent for AnimalAI.") parser.add_argument('--train_name', type=str, help='Will save model with this name. Default: random') parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file. Default: "configs/1-Food.yaml"') @@ -23,15 +25,15 @@ args = parser.parse_args() -if not args.inference: - if args.train_name is not None: +if not args.inference: + if args.train_name is not None: train_filename = '{}.pth'.format(args.train_name) - else: - train_filename = 'ppo_{}.pth'.format(np.random.randint(100000,999999)) + else: + train_filename = 'ppo_{}.pth'.format(np.random.randint(100000,999999)) # my params env_path = '../env/AnimalAI' -brain_name='Learner' -train_mode=True +brain_name = 'Learner' +train_mode = True num_actions = 9 color_channels = 3 env_field = args.config @@ -176,14 +178,14 @@ def train(): model = PPO() - if os.path.exists(args.load_model): + if os.path.exists(args.load_model): model.load_state_dict(torch.load(args.load_model)) print("Successfully loaded saved model from {}".format(args.load_model)) model = model.to(device) - total_obs = 0 + total_obs = 0 for n_epi in range(1, n_episodes+1): action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) @@ -210,6 +212,9 @@ def train(): #s_prime, reward, done, info = action_info = env.step(vector_action=action) next_state = action_info[brain_name].visual_observations[0] + velocity_obs = action_info[brain_name].vector_observations + print(velocity_obs) + asdf #next_state = np.moveaxis(next_state, -1, 0) next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] reward = action_info[brain_name].rewards # list of rewards len = n_arenas @@ -229,7 +234,7 @@ def train(): if done: break - start_train = time.time() + start_train = time.time() model.train_net() end_train = time.time() #print('time to train: ',end_train - start_train) @@ -246,25 +251,41 @@ def train(): if n_epi%save_interval==0 and n_epi!=0: print("Saving model to {}ppo.pth at {}".format(save_path, datetime.datetime.now())) torch.save(model.state_dict(), save_path+train_filename) - + env.close() +def env_info(env_config): + + for i, arena in env_config.arenas.items(): + print("Arena Config #{}".format(i)) + print("max time steps = {}".format(arena.t)) + for j, item in enumerate(arena.items): + print("Item name: {}".format(item.name)) + print("Item positions: {}".format(item.positions)) + print("Item rotations: {}".format(item.rotations)) + print("Item sizes: {}".format(item.sizes)) + print("Item colors: {}".format(item.colors)) + def inference(): env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) - arena_config_in = ArenaConfig(env_field) - #print(arena_config_in.arenas) + #arena_config_in = ArenaConfig(env_field) + + + b_env = better_env(n_arenas = 1) + arena_config_in = b_env.env_config + ps = position_tracker(b_env.get_start_positions()) model = PPO() - if os.path.exists(args.load_model): + if os.path.exists(args.load_model): model.load_state_dict(torch.load(args.load_model)) print("Successfully loaded saved model from {}".format(args.load_model)) model = model.to(device) - total_obs = 0 + total_obs = 0 for n_epi in range(1, n_episodes+1): action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) @@ -290,6 +311,11 @@ def inference(): #s_prime, reward, done, info = action_info = env.step(vector_action=action) next_state = action_info[brain_name].visual_observations[0] + velocity_obs = action_info[brain_name].vector_observations + #ps.position_step(velocity_obs) + + #print('Current position = {}'.format(ps.current_position)) + next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] reward = action_info[brain_name].rewards # list of rewards len = n_arenas arenas_done = action_info[brain_name].local_done @@ -307,7 +333,7 @@ def inference(): if done: break - #start_train = time.time() + #start_train = time.time() #model.train_net() #end_train = time.time() #print('time to train: ',end_train - start_train) @@ -321,14 +347,14 @@ def inference(): if n_epi%print_interval==0 and n_epi!=0: print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, score/n_obs, n_obs/(end_episode - start_episode))) - + env.close() if __name__ == '__main__': - if not args.inference: + if not args.inference: print("Starting agent in train mode...") train() - else: + else: print("Starting agent in inference mode...") inference() From 46a13ec46695c3f23c49b39f8cd9bb84db78d6ed Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Fri, 23 Aug 2019 17:00:50 -0700 Subject: [PATCH 7/8] more position code --- trainers/env_utils.py | 48 +++++++++++++++++++++++++++++++++---------- trainers/ppo.py | 48 ++++++++++++++++++++++++------------------- 2 files changed, 64 insertions(+), 32 deletions(-) diff --git a/trainers/env_utils.py b/trainers/env_utils.py index 3bd68c0b..df341f6f 100644 --- a/trainers/env_utils.py +++ b/trainers/env_utils.py @@ -10,22 +10,39 @@ class position_tracker(): - def __init__(self, starting_positions): + def __init__(self, starting_positions, starting_rotations): self.agent_start = starting_positions['Agent'] self.good_goal_start = starting_positions['GoodGoal'] + self.current_position = np.array(self.agent_start).astype('float64') + self.current_rotation = np.array(starting_rotations['Agent']).astype('float64') + + + def position_step(self, velocity_obs, action): - def position_step(self, velocity_obs): - velocity_obs = np.array(velocity_obs) + action = np.array(action) + self.current_rotation[np.where(action[:,1] == 1)] -= 6 + self.current_rotation[np.where(action[:,1] == 2)] += 6 + + rot_mat = get_rot_mat(deg_to_rad(self.current_rotation[0][0])) + + velocity_obs = np.dot(rot_mat, np.array(velocity_obs).T).T delta_distance = 0.0595 * velocity_obs self.current_position += delta_distance +def deg_to_rad(deg): + return deg * (np.pi/180) + +def get_rot_mat(rad): + return np.array([[np.cos(rad), 0, -np.sin(rad)],[0, 1, 0],[np.sin(rad), 0, np.cos(rad)]]) + + class better_env(): @@ -39,7 +56,7 @@ def __init__(self, n_arenas=3): def create_env(self, n_arenas=3): - include_items = {'Agent':1, 'GoodGoal':1, 'Wall':2} + include_items = {'Agent':1}#, 'GoodGoal':1, 'Wall':2} env_config = ArenaConfig() @@ -60,6 +77,7 @@ def create_env(self, n_arenas=3): name = item_type colors = [] positions = [] + rotations = [] # Loop over item counts for j in range(item_count): @@ -77,14 +95,18 @@ def create_env(self, n_arenas=3): positions.append(Vector3(x=x, y=y, z=z)) elif item_type == 'Agent': - x = np.random.randint(1,39) - y = np.random.randint(1,39) - z = np.random.randint(1,39) + #x = np.random.randint(1,39) + #y = np.random.randint(1,39) + #z = np.random.randint(1,39) + x = 0.5 + y = 0.5 + z = 0.5 #self.details[i][item_type]['positions'].append((x,y,z)) positions.append(Vector3(x=x, y=y, z=z)) + rotations.append(0) - item_list.append(Item(name=name, positions=positions, colors=colors)) + item_list.append(Item(name=name, positions=positions, rotations=rotations, colors=colors)) env_config.arenas[i].items = item_list return env_config @@ -117,6 +139,7 @@ def get_details(self): def get_start_positions(self): start_positions = {'Agent': [], 'GoodGoal': []} + start_rotations = {'Agent':[]} for arena_idx, arena in self.env_config.arenas.items(): @@ -124,8 +147,11 @@ def get_start_positions(self): if item.name == 'Agent' or item.name == 'GoodGoal': for position in item.positions: start_positions[item.name].append([position.x, position.y, position.z]) + if item.name == 'Agent': + for rotation in item.rotations: + start_rotations[item.name].append([rotation]) - return start_positions + return start_positions, start_rotations @@ -151,6 +177,6 @@ def env_info(env_config): pp.pprint(env.get_start_positions()) - -ps = position_tracker(env.get_start_positions()) +start_pos, start_rot = env.get_start_positions() +ps = position_tracker(start_pos, start_rot) print(ps.current_position) diff --git a/trainers/ppo.py b/trainers/ppo.py index 5ee2ad6b..33dc71bd 100644 --- a/trainers/ppo.py +++ b/trainers/ppo.py @@ -268,13 +268,14 @@ def env_info(env_config): print("Item colors: {}".format(item.colors)) def inference(): - env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=False,inference=args.inference) #arena_config_in = ArenaConfig(env_field) b_env = better_env(n_arenas = 1) arena_config_in = b_env.env_config - ps = position_tracker(b_env.get_start_positions()) + start_positions, start_rotations = b_env.get_start_positions() + ps = position_tracker(start_positions, start_rotations) model = PPO() @@ -288,16 +289,30 @@ def inference(): total_obs = 0 for n_epi in range(1, n_episodes+1): - action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) + + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=False) state = action_info[brain_name].visual_observations[0] - #state = np.moveaxis(state, -1, 0) state = np.moveaxis(state, -1, 1) done = False score = 0.0 start_episode = time.time() n_obs = 0 + action = [[0,1]] + action_info = env.step(vector_action=action) + velocity_obs = action_info[brain_name].vector_observations + ps.position_step(velocity_obs, action) + action_info = env.step(vector_action=action) + velocity_obs = action_info[brain_name].vector_observations + ps.position_step(velocity_obs, action) + action_info = env.step(vector_action=action) + velocity_obs = action_info[brain_name].vector_observations + ps.position_step(velocity_obs, action) + action_info = env.step(vector_action=action) + velocity_obs = action_info[brain_name].vector_observations + ps.position_step(velocity_obs, action) + while not done: for t in range(T_horizon): n_obs += n_arenas @@ -305,27 +320,25 @@ def inference(): prob = model.pi(torch.from_numpy(state).float().to(device)) m = Categorical(prob) - #a = m.sample().item() a = m.sample() - action = actions_array[a.cpu().numpy().astype(int)] - #s_prime, reward, done, info = + #action = actions_array[a.cpu().numpy().astype(int)] + #if np.random.randint(0,2): + # action = [0,1] + #else: + # action = [0,2] action_info = env.step(vector_action=action) + action = [[1,0]] next_state = action_info[brain_name].visual_observations[0] velocity_obs = action_info[brain_name].vector_observations - #ps.position_step(velocity_obs) - #print('Current position = {}'.format(ps.current_position)) + ps.position_step(velocity_obs, action) + print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] reward = action_info[brain_name].rewards # list of rewards len = n_arenas arenas_done = action_info[brain_name].local_done done = any(arenas_done) - #prob_a = prob[np.arange(prob.shape[0])[:,None], a.cpu().numpy().astype(int)[:,None]] - - #for (s, a, r, n_s, p_a, d) in zip (state, a, reward, next_state, prob_a, arenas_done): - # model.put_data((s, a, r, n_s, p_a, d)) - # scores.append(r) state = next_state @@ -333,16 +346,9 @@ def inference(): if done: break - #start_train = time.time() - #model.train_net() - #end_train = time.time() - #print('time to train: ',end_train - start_train) end_episode = time.time() - #print('{} observations/second'.format(n_obs/(end_episode - start_episode))) - - #scores.append(score) if n_epi%print_interval==0 and n_epi!=0: print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, score/n_obs, n_obs/(end_episode - start_episode))) From 13c5219ac7a7e06199c587741ed512118d4676e8 Mon Sep 17 00:00:00 2001 From: Lucas Tindall Date: Thu, 5 Sep 2019 11:46:36 -0700 Subject: [PATCH 8/8] Position, angle and visited squares tracking. --- trainers/a3c_src/env.py | 108 +++++++++++++++ trainers/a3c_src/model.py | 70 ++++++++++ trainers/a3c_src/optimizer.py | 18 +++ trainers/a3c_src/process.py | 240 ++++++++++++++++++++++++++++++++++ trainers/a3c_test.py | 124 ++++++++++++++++++ trainers/a3c_train.py | 93 +++++++++++++ trainers/env_utils.py | 131 +++++++++++++++---- trainers/ppo.py | 53 ++++---- 8 files changed, 787 insertions(+), 50 deletions(-) create mode 100644 trainers/a3c_src/env.py create mode 100644 trainers/a3c_src/model.py create mode 100644 trainers/a3c_src/optimizer.py create mode 100644 trainers/a3c_src/process.py create mode 100644 trainers/a3c_test.py create mode 100644 trainers/a3c_train.py diff --git a/trainers/a3c_src/env.py b/trainers/a3c_src/env.py new file mode 100644 index 00000000..dfa5a78c --- /dev/null +++ b/trainers/a3c_src/env.py @@ -0,0 +1,108 @@ +""" +@author: Viet Nguyen +""" + +import gym_super_mario_bros +from gym.spaces import Box +from gym import Wrapper +from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv +from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY +import cv2 +import numpy as np +import subprocess as sp + + +class Monitor: + def __init__(self, width, height, saved_path): + + self.command = ["ffmpeg", "-y", "-f", "rawvideo", "-vcodec", "rawvideo", "-s", "{}X{}".format(width, height), + "-pix_fmt", "rgb24", "-r", "80", "-i", "-", "-an", "-vcodec", "mpeg4", saved_path] + try: + self.pipe = sp.Popen(self.command, stdin=sp.PIPE, stderr=sp.PIPE) + except FileNotFoundError: + pass + + def record(self, image_array): + self.pipe.stdin.write(image_array.tostring()) + + +def process_frame(frame): + if frame is not None: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize(frame, (84, 84))[None, :, :] / 255. + return frame + else: + return np.zeros((1, 84, 84)) + + +class CustomReward(Wrapper): + def __init__(self, env=None, monitor=None): + super(CustomReward, self).__init__(env) + self.observation_space = Box(low=0, high=255, shape=(1, 84, 84)) + self.curr_score = 0 + if monitor: + self.monitor = monitor + else: + self.monitor = None + + def step(self, action): + state, reward, done, info = self.env.step(action) + if self.monitor: + self.monitor.record(state) + state = process_frame(state) + reward += (info["score"] - self.curr_score) / 40. + self.curr_score = info["score"] + if done: + if info["flag_get"]: + reward += 50 + else: + reward -= 50 + return state, reward / 10., done, info + + def reset(self): + self.curr_score = 0 + return process_frame(self.env.reset()) + + +class CustomSkipFrame(Wrapper): + def __init__(self, env, skip=4): + super(CustomSkipFrame, self).__init__(env) + self.observation_space = Box(low=0, high=255, shape=(4, 84, 84)) + self.skip = skip + + def step(self, action): + total_reward = 0 + states = [] + state, reward, done, info = self.env.step(action) + for i in range(self.skip): + if not done: + state, reward, done, info = self.env.step(action) + total_reward += reward + states.append(state) + else: + states.append(state) + states = np.concatenate(states, 0)[None, :, :, :] + return states.astype(np.float32), reward, done, info + + def reset(self): + state = self.env.reset() + states = np.concatenate([state for _ in range(self.skip)], 0)[None, :, :, :] + return states.astype(np.float32) + + +def create_train_env(world, stage, action_type, output_path=None): + env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage)) + if output_path: + monitor = Monitor(256, 240, output_path) + else: + monitor = None + if action_type == "right": + actions = RIGHT_ONLY + elif action_type == "simple": + actions = SIMPLE_MOVEMENT + else: + actions = COMPLEX_MOVEMENT + env = BinarySpaceToDiscreteSpaceEnv(env, actions) + env = CustomReward(env, monitor) + env = CustomSkipFrame(env) + return env, env.observation_space.shape[0], len(actions) diff --git a/trainers/a3c_src/model.py b/trainers/a3c_src/model.py new file mode 100644 index 00000000..50627d65 --- /dev/null +++ b/trainers/a3c_src/model.py @@ -0,0 +1,70 @@ +""" +@author: Viet Nguyen +""" + +import torch.nn as nn +import torch.nn.functional as F + + +class ActorCritic(nn.Module): + def __init__(self, num_inputs, num_actions): + super(ActorCritic, self).__init__() + self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) + self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.lstm = nn.LSTMCell(32 * 6 * 6, 512) + self.critic_linear = nn.Linear(512, 1) + self.actor_linear = nn.Linear(512, num_actions) + self._initialize_weights() + + def _initialize_weights(self): + for module in self.modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight) + # nn.init.kaiming_uniform_(module.weight) + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LSTMCell): + nn.init.constant_(module.bias_ih, 0) + nn.init.constant_(module.bias_hh, 0) + + def forward(self, x, hx, cx): + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + hx, cx = self.lstm(x.view(x.size(0), -1), (hx, cx)) + return self.actor_linear(hx), self.critic_linear(hx), hx, cx + + + +class Mapper(nn.Module): + def __init__(self, num_inputs): + super(Mapper, self).__init__() + self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) + self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.lstm = nn.LSTMCell(32 * 6 * 6, 400) + self.map_final = nn.Linear(400, 1600) + self._initialize_weights() + + def _initialize_weights(self): + for module in self.modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight) + # nn.init.kaiming_uniform_(module.weight) + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LSTMCell): + nn.init.constant_(module.bias_ih, 0) + nn.init.constant_(module.bias_hh, 0) + + def forward(self, x, hx, cx): + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + hx, cx = self.lstm(x.view(x.size(0), -1), (hx, cx)) + return self.map_final(hx), hx, cx + + diff --git a/trainers/a3c_src/optimizer.py b/trainers/a3c_src/optimizer.py new file mode 100644 index 00000000..385b5264 --- /dev/null +++ b/trainers/a3c_src/optimizer.py @@ -0,0 +1,18 @@ +""" +@author: Viet Nguyen +""" + +import torch + +class GlobalAdam(torch.optim.Adam): + def __init__(self, params, lr): + super(GlobalAdam, self).__init__(params, lr=lr) + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data) + + state['exp_avg'].share_memory_() + state['exp_avg_sq'].share_memory_() diff --git a/trainers/a3c_src/process.py b/trainers/a3c_src/process.py new file mode 100644 index 00000000..fe01b07c --- /dev/null +++ b/trainers/a3c_src/process.py @@ -0,0 +1,240 @@ +""" +@author: Viet Nguyen +""" + +import torch +#from src.env import create_train_env +from a3c_src.model import ActorCritic +import torch.nn.functional as F +from torch.distributions import Categorical +from collections import deque +from tensorboardX import SummaryWriter +import timeit + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + +from env_utils import * + +#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +device = torch.device("cpu") +actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) +brain_name = 'Learner' + +def local_train(index, opt, global_model, optimizer, save=False): + torch.manual_seed(123 + index) + if save: + start_time = timeit.default_timer() + writer = SummaryWriter(opt.log_path) + + + # Unity + #env_path = '../env/AnimalAI' + #n_arenas=1 + #env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=False,inference=False) + + + + b_env = better_env(n_arenas = 1) + env = b_env.env + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + # end unity + num_states = 3 + num_actions = 5 + + #env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type) + local_model = ActorCritic(num_states, num_actions).to(device) + local_model.train() + + + + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=True) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + + + + done = True + curr_step = 0 + curr_episode = 0 + while True: + if save: + if curr_episode % opt.save_interval == 0 and curr_episode > 0: + torch.save(global_model.state_dict(), + "{}/{}_{}".format(opt.saved_path, opt.saved_filepath, curr_episode)) + #print("Process {}. Episode {}".format(index, curr_episode)) + if curr_episode > 0: + print("Process {}. Episode {}, total_loss = {}".format(index, curr_episode, total_loss.item())) + curr_episode += 1 + local_model.load_state_dict(global_model.state_dict()) + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + h_0 = h_0.to(device) + c_0 = c_0.to(device) + #if opt.use_gpu: + # h_0 = h_0.cuda() + # c_0 = c_0.cuda() + + log_policies = [] + values = [] + rewards = [] + entropies = [] + + for _ in range(opt.num_local_steps): + curr_step += 1 + logits, value, h_0, c_0 = local_model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + log_policy = F.log_softmax(logits, dim=1) + entropy = -(policy * log_policy).sum(1, keepdim=True) + + m = Categorical(policy) + action_idx = m.sample().item() + + action = actions_array[action_idx] + #action = actions_array[action.cpu().numpy().astype(int)] + #state, reward, done, _ = env.step(action) + action_info = env.step(vector_action=action) + + + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + velocity_obs = action_info[brain_name].vector_observations + b_env.position_tracker.position_step(velocity_obs, action) + #print("Distance to goal = {}".format(ps.distance_to_goal())) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + reward = reward[0] + + # reward based on visiting squares + total_unvisited = np.sum(b_env.position_tracker.visited) + reward -= total_unvisited/10000 + reward -= b_env.position_tracker.distance_to_goal()/500 + reward -= b_env.position_tracker.angle_to_goal()/1000 + #print("{} reward = {}".format(index, reward)) + + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + + + + + #state = torch.from_numpy(state) + + if opt.use_gpu: + state = state.cuda() + if curr_step > opt.num_global_steps: + done = True + + #if curr_step > 500: + # done = True + + if done: + curr_step = 0 + + #b_env = better_env(n_arenas = 1) + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + b_env.generate_new_config() + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=True) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + #state = torch.from_numpy(env.reset()) + #if opt.use_gpu: + # state = state.cuda() + + values.append(value) + log_policies.append(log_policy[0, action_idx]) + rewards.append(reward) + entropies.append(entropy) + + if done: + break + + + R = torch.zeros((1, 1), dtype=torch.float) + if opt.use_gpu: + R = R.cuda() + if not done: + _, R, _, _ = local_model(state, h_0, c_0) + + gae = torch.zeros((1, 1), dtype=torch.float) + if opt.use_gpu: + gae = gae.cuda() + actor_loss = 0 + critic_loss = 0 + entropy_loss = 0 + next_value = R + + for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]: + gae = gae * opt.gamma * opt.tau + gae = gae + reward + opt.gamma * next_value.detach() - value.detach() + next_value = value + actor_loss = actor_loss + log_policy * gae + R = R * opt.gamma + reward + critic_loss = critic_loss + (R - value) ** 2 / 2 + entropy_loss = entropy_loss + entropy + + total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss + #print("Loss = {}".format(total_loss)) + writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode) + optimizer.zero_grad() + total_loss.backward() + + for local_param, global_param in zip(local_model.parameters(), global_model.parameters()): + if global_param.grad is not None: + break + global_param._grad = local_param.grad + + optimizer.step() + + if curr_episode == int(opt.num_global_steps / opt.num_local_steps): + print("Training process {} terminated".format(index)) + if save: + end_time = timeit.default_timer() + print('The code runs for %.2f s ' % (end_time - start_time)) + return + + +def local_test(index, opt, global_model): + torch.manual_seed(123 + index) + env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type) + local_model = ActorCritic(num_states, num_actions) + local_model.eval() + state = torch.from_numpy(env.reset()) + done = True + curr_step = 0 + actions = deque(maxlen=opt.max_actions) + while True: + curr_step += 1 + if done: + local_model.load_state_dict(global_model.state_dict()) + with torch.no_grad(): + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + logits, value, h_0, c_0 = local_model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + action = torch.argmax(policy).item() + state, reward, done, _ = env.step(action) + env.render() + actions.append(action) + if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen: + done = True + if done: + curr_step = 0 + actions.clear() + state = env.reset() + state = torch.from_numpy(state) diff --git a/trainers/a3c_test.py b/trainers/a3c_test.py new file mode 100644 index 00000000..75a435ab --- /dev/null +++ b/trainers/a3c_test.py @@ -0,0 +1,124 @@ +""" +@author: Viet Nguyen +""" + +import os + +os.environ['OMP_NUM_THREADS'] = '1' +import argparse +import torch +from a3c_src.model import ActorCritic +import torch.nn.functional as F +import numpy as np +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + +from env_utils import * + + +def get_args(): + parser = argparse.ArgumentParser( + """Implementation of model described in the paper: Asynchronous Methods for Deep Reinforcement Learning for Super Mario Bros""") + parser.add_argument("--saved_filepath", type=str, default="trained_models/a3c_animalai") + args = parser.parse_args() + return args + + +def test(opt): + + + # AnimalAI + device = torch.device("cpu") + num_states = 3 + num_actions = 5 + actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) + brain_name = 'Learner' + # AnimalAI + + torch.manual_seed(123) + + + #env=UnityEnvironment(file_name='../env/AnimalAI', n_arenas=1, worker_id=np.random.randint(1,100), play=False,inference=True) + b_env = better_env(n_arenas = 1, walls=1,t=100, inference=True) + env = b_env.env + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + + + + + model = ActorCritic(num_states, num_actions) + + basepath = opt.saved_filepath.split('/')[0] + basename = opt.saved_filepath.split('/')[1] + + found_models = [int(filenames.split('_')[-1]) for filenames in os.listdir(basepath) if basename in filenames] + if len(found_models) > 0: + latest = max(found_models) + model.load_state_dict(torch.load("{}_{}".format(opt.saved_filepath, latest))) + model = model.to(device) + print("Loaded saved model from {}_{}".format(opt.saved_filepath, latest)) + else: + print("Could not find model to load.") + raise + + + ''' + if torch.cuda.is_available(): + model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))) + model.cuda() + else: + model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage), + map_location=lambda storage, loc: storage)) + ''' + + model.eval() + + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=False) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + done = True + while True: + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + #b_env = better_env(n_arenas = 1) + #arena_config_in = b_env.env_config + b_env.generate_new_config() + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=False) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + h_0 = h_0.to(device) + c_0 = c_0.to(device) + state = state.to(device) + + logits, value, h_0, c_0 = model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + action_idx = torch.argmax(policy).item() + action_idx = int(action_idx) + action = actions_array[action_idx] + action_info = env.step(vector_action=action) + + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + + velocity_obs = action_info[brain_name].vector_observations + b_env.position_tracker.position_step(velocity_obs, action) + + #print("{}__{}".format(b_env.position_tracker.current_rotation,b_env.position_tracker.angle_to_goal())) + print("Current position = {}".format(b_env.position_tracker.current_position)) + + + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + + +if __name__ == "__main__": + opt = get_args() + test(opt) diff --git a/trainers/a3c_train.py b/trainers/a3c_train.py new file mode 100644 index 00000000..9a20b1f2 --- /dev/null +++ b/trainers/a3c_train.py @@ -0,0 +1,93 @@ +""" +modified by: Lucas Tindall +@author: Viet Nguyen +""" + +import os +os.environ['OMP_NUM_THREADS'] = '1' +import argparse +import torch +#from src.env import create_train_env +from a3c_src.model import ActorCritic +from a3c_src.optimizer import GlobalAdam +from a3c_src.process import local_train, local_test +import torch.multiprocessing as _mp +import shutil + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + + +from env_utils import * + +def get_args(): + parser = argparse.ArgumentParser( + """Implementation of model described in the paper: Asynchronous Methods for Deep Reinforcement Learning for Super Mario Bros""") + #parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file. Default: "configs/1-Food.yaml"') + #parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load. Default: "saved_models/ppo.pth"') + #parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode. Default: False') + #parser.add_argument("--world", type=int, default=1) + #parser.add_argument("--stage", type=int, default=1) + #parser.add_argument("--action_type", type=str, default="complex") + parser.add_argument('--lr', type=float, default=1e-4) + parser.add_argument('--gamma', type=float, default=0.9, help='discount factor for rewards') + parser.add_argument('--tau', type=float, default=1.0, help='parameter for GAE') + parser.add_argument('--beta', type=float, default=0.01, help='entropy coefficient') + parser.add_argument("--num_local_steps", type=int, default=50) + parser.add_argument("--num_global_steps", type=int, default=5e6) + parser.add_argument("--num_processes", type=int, default=4) + parser.add_argument("--save_interval", type=int, default=500, help="Number of steps between savings") + parser.add_argument("--max_actions", type=int, default=200, help="Maximum repetition steps in test phase") + parser.add_argument("--log_path", type=str, default="tensorboard/a3c_super_mario_bros") + parser.add_argument("--saved_path", type=str, default="trained_models") + parser.add_argument("--saved_filepath", type=str, default="a3c_animalai") + parser.add_argument("--load_model", type=str, default="") + #parser.add_argument("--load_from_previous_stage", type=bool, default=False, + # help="Load weight from previous trained stage") + parser.add_argument("--use_gpu", type=bool, default=False) + args = parser.parse_args() + return args + + +def train(opt): + torch.manual_seed(123) + if os.path.isdir(opt.log_path): + shutil.rmtree(opt.log_path) + os.makedirs(opt.log_path) + if not os.path.isdir(opt.saved_path): + os.makedirs(opt.saved_path) + mp = _mp.get_context("spawn") + #mp = _mp.get_context("fork") + + num_states = 3 + num_actions = 5 + global_model = ActorCritic(num_states, num_actions) + + if opt.use_gpu: + global_model.cuda() + global_model.share_memory() + + if os.path.isfile("{}/{}".format(opt.saved_path, opt.load_model)): + print("loaded global model from {}/{}".format(opt.saved_path, opt.load_model)) + global_model.load_state_dict(torch.load("{}/{}".format(opt.saved_path, opt.load_model))) + + optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr) + processes = [] + for index in range(opt.num_processes): + print("local train {}".format(index)) + if index == 0: + process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer, True)) + else: + process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer)) + process.start() + processes.append(process) + #process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model)) + #process.start() + #processes.append(process) + for process in processes: + process.join() + + +if __name__ == "__main__": + opt = get_args() + train(opt) diff --git a/trainers/env_utils.py b/trainers/env_utils.py index df341f6f..2a8cb5e1 100644 --- a/trainers/env_utils.py +++ b/trainers/env_utils.py @@ -1,4 +1,5 @@ from animalai.envs.arena_config import Vector3, RGB, Item, Arena, ArenaConfig +from animalai.envs import UnityEnvironment from collections import defaultdict import numpy as np import pprint @@ -6,27 +7,35 @@ pp = pprint.PrettyPrinter(indent=4) - +np.set_printoptions(threshold=np.inf) class position_tracker(): def __init__(self, starting_positions, starting_rotations): self.agent_start = starting_positions['Agent'] - self.good_goal_start = starting_positions['GoodGoal'] + self.good_goal_start = np.array(starting_positions['GoodGoal']).astype('float64') self.current_position = np.array(self.agent_start).astype('float64') self.current_rotation = np.array(starting_rotations['Agent']).astype('float64') + self.visited = np.ones((40,40)) + + def position_step(self, velocity_obs, action): action = np.array(action) - self.current_rotation[np.where(action[:,1] == 1)] -= 6 - self.current_rotation[np.where(action[:,1] == 2)] += 6 + + if len(action.shape) > 1: + self.current_rotation[np.where(action[:,1] == 1)] -= 7 + self.current_rotation[np.where(action[:,1] == 2)] += 7 + else: + self.current_rotation[np.where(action[1] == 1)] -= 7 + self.current_rotation[np.where(action[1] == 2)] += 7 rot_mat = get_rot_mat(deg_to_rad(self.current_rotation[0][0])) @@ -35,10 +44,52 @@ def position_step(self, velocity_obs, action): self.current_position += delta_distance + square_coord = np.floor(self.current_position[0]).astype(int)[[0,2]] + + if all(square_coord >= 0) and all(square_coord < 40): + self.visited[square_coord[1],square_coord[0]] = 0 + + + + + + + def distance_to_goal(self): + + + distance = 0 + for g_pos, a_pos in zip(self.good_goal_start[0], self.current_position[0]): + + distance += (g_pos - a_pos)**2 + distance = distance ** (0.5) + + return distance + + def angle_to_goal(self): + + + agent_to_goal_vec = self.good_goal_start - self.current_position + agent_to_goal_vec = np.delete(agent_to_goal_vec, 1, 1) + + agent_face_vec = np.array([-np.sin(deg_to_rad(self.current_rotation[0][0])), np.cos(deg_to_rad(self.current_rotation[0][0]))]) + + angle = np.arccos(np.dot(agent_to_goal_vec, agent_face_vec)/(np.linalg.norm(agent_to_goal_vec)*np.linalg.norm(agent_face_vec))) + + deg = rad_to_deg(angle) + + if np.isnan(deg): + return 0 + else: + return deg[0] + + def deg_to_rad(deg): return deg * (np.pi/180) +def rad_to_deg(rad): + return rad * (180/np.pi) + def get_rot_mat(rad): return np.array([[np.cos(rad), 0, -np.sin(rad)],[0, 1, 0],[np.sin(rad), 0, np.cos(rad)]]) @@ -46,24 +97,47 @@ def get_rot_mat(rad): class better_env(): - def __init__(self, n_arenas=3): + def __init__(self, n_arenas=2, walls=2, t=250, play=False, inference=False): + print(n_arenas) self.n_arenas = n_arenas - #self.details = {} - self.env_config = self.create_env(n_arenas=n_arenas) - self.details = self.get_details() + self.walls = walls + self.t = t + #self.env_config = self.create_env() + self.generate_new_config() + self.env = UnityEnvironment(file_name='../env/AnimalAI', n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=play,inference=inference) + + + start_positions, start_rotations = self.get_start_positions() + self.position_tracker = position_tracker(start_positions, start_rotations) + def generate_new_config(self): + self.env_config = self.create_env() + start_positions, start_rotations = self.get_start_positions() + self.position_tracker = position_tracker(start_positions, start_rotations) - def create_env(self, n_arenas=3): - include_items = {'Agent':1}#, 'GoodGoal':1, 'Wall':2} + def create_env(self): + + #print("Creating {} arenas!!!".format(self.n_arenas)) + + #include_items = {'Agent':1}#, 'GoodGoal':1, 'Wall':2} + include_items = {'Agent':1, 'GoodGoal':1} + if self.walls > 0: + include_items['Wall'] = self.walls + + if True: + include_items['GoodGoalMulti'] = 1 + + if True: + include_items['BadGoal'] = 1 env_config = ArenaConfig() # Loop over arenas - for i in range(n_arenas): - env_config.arenas[i] = Arena() + for i in range(self.n_arenas): + env_config.arenas[i] = Arena(t=self.t) #self.details[i] = {} @@ -86,21 +160,24 @@ def create_env(self, n_arenas=3): #self.details[i][item_type]['colors'].append((153,153,153)) - elif item_type == 'GoodGoal': + elif item_type in ['GoodGoal', 'GoodGoalMulti', 'BadGoal']: x = np.random.randint(1,39) - y = np.random.randint(1,39) + #y = np.random.randint(1,39) + y = 1 + z = np.random.randint(1,39) #self.details[i][item_type]['positions'].append((x,y,z)) positions.append(Vector3(x=x, y=y, z=z)) elif item_type == 'Agent': - #x = np.random.randint(1,39) + x = np.random.randint(1,39) #y = np.random.randint(1,39) - #z = np.random.randint(1,39) - x = 0.5 - y = 0.5 - z = 0.5 + y = 1 + z = np.random.randint(1,39) + #x = 0.5 + #y = 0.5 + #z = 0.5 #self.details[i][item_type]['positions'].append((x,y,z)) positions.append(Vector3(x=x, y=y, z=z)) @@ -169,14 +246,14 @@ def env_info(env_config): print("{:8s}Item sizes: {}".format('',item.sizes)) print("{:8s}Item colors: {}".format('',item.colors)) -env = better_env() -env_config = env.env_config -env_info(env_config) -pp.pprint(env.details) +#env = better_env() +#env_config = env.env_config +#env_info(env_config) +#pp.pprint(env.details) #pp.pprint(env.details2) -pp.pprint(env.get_start_positions()) +#pp.pprint(env.get_start_positions()) -start_pos, start_rot = env.get_start_positions() -ps = position_tracker(start_pos, start_rot) -print(ps.current_position) +#start_pos, start_rot = env.get_start_positions() +#ps = position_tracker(start_pos, start_rot) +#print(ps.current_position) diff --git a/trainers/ppo.py b/trainers/ppo.py index 33dc71bd..ca69b400 100644 --- a/trainers/ppo.py +++ b/trainers/ppo.py @@ -34,13 +34,16 @@ env_path = '../env/AnimalAI' brain_name = 'Learner' train_mode = True -num_actions = 9 color_channels = 3 env_field = args.config n_episodes = 20000 #max_t = 100 -actions_array = np.array([[0,0],[0,1],[0,2],[1,0], [1,1],[1,2], [2,0],[2,1],[2,2]]) -n_arenas = 3 +#num_actions = 9 +#actions_array = np.array([[0,0],[0,1],[0,2],[1,0], [1,1],[1,2], [2,0],[2,1],[2,2]]) +num_actions = 5 +actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) + +n_arenas = 1 print_interval = 1 save_interval = 10 save_path = 'saved_models/' @@ -173,7 +176,7 @@ def train_net(self): def train(): env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) - arena_config_in = ArenaConfig(env_field) + #arena_config_in = ArenaConfig(env_field) #print(arena_config_in.arenas) @@ -188,6 +191,11 @@ def train(): total_obs = 0 for n_epi in range(1, n_episodes+1): + b_env = better_env(n_arenas = 1) + arena_config_in = b_env.env_config + start_positions, start_rotations = b_env.get_start_positions() + ps = position_tracker(start_positions, start_rotations) + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) state = action_info[brain_name].visual_observations[0] @@ -213,11 +221,16 @@ def train(): action_info = env.step(vector_action=action) next_state = action_info[brain_name].visual_observations[0] velocity_obs = action_info[brain_name].vector_observations - print(velocity_obs) - asdf + + ps.position_step(velocity_obs, action) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + #print('Distance to goal = {}'.format(ps.distance_to_goal())) + #next_state = np.moveaxis(next_state, -1, 0) next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] reward = action_info[brain_name].rewards # list of rewards len = n_arenas + reward -= ps.distance_to_goal()/100 + #print(reward) arenas_done = action_info[brain_name].local_done done = any(arenas_done) @@ -249,7 +262,7 @@ def train(): print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, np.mean(scores)/n_arenas, n_obs/(end_episode - start_episode))) if n_epi%save_interval==0 and n_epi!=0: - print("Saving model to {}ppo.pth at {}".format(save_path, datetime.datetime.now())) + print("Saving model to {} at {}".format(save_path+train_filename, datetime.datetime.now())) torch.save(model.state_dict(), save_path+train_filename) @@ -272,10 +285,6 @@ def inference(): #arena_config_in = ArenaConfig(env_field) - b_env = better_env(n_arenas = 1) - arena_config_in = b_env.env_config - start_positions, start_rotations = b_env.get_start_positions() - ps = position_tracker(start_positions, start_rotations) model = PPO() @@ -290,6 +299,11 @@ def inference(): for n_epi in range(1, n_episodes+1): + b_env = better_env(n_arenas = 1) + arena_config_in = b_env.env_config + start_positions, start_rotations = b_env.get_start_positions() + ps = position_tracker(start_positions, start_rotations) + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=False) state = action_info[brain_name].visual_observations[0] @@ -303,15 +317,7 @@ def inference(): action_info = env.step(vector_action=action) velocity_obs = action_info[brain_name].vector_observations ps.position_step(velocity_obs, action) - action_info = env.step(vector_action=action) - velocity_obs = action_info[brain_name].vector_observations - ps.position_step(velocity_obs, action) - action_info = env.step(vector_action=action) - velocity_obs = action_info[brain_name].vector_observations - ps.position_step(velocity_obs, action) - action_info = env.step(vector_action=action) - velocity_obs = action_info[brain_name].vector_observations - ps.position_step(velocity_obs, action) + print('Start position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) while not done: for t in range(T_horizon): @@ -321,18 +327,19 @@ def inference(): m = Categorical(prob) a = m.sample() - #action = actions_array[a.cpu().numpy().astype(int)] + action = actions_array[a.cpu().numpy().astype(int)] #if np.random.randint(0,2): # action = [0,1] #else: # action = [0,2] action_info = env.step(vector_action=action) - action = [[1,0]] + #action = [[1,0]] next_state = action_info[brain_name].visual_observations[0] velocity_obs = action_info[brain_name].vector_observations ps.position_step(velocity_obs, action) - print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + print('Distance to goal = {}'.format(ps.distance_to_goal())) next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] reward = action_info[brain_name].rewards # list of rewards len = n_arenas