diff --git a/rllib/agents/dqn/dqn_torch_policy.py b/rllib/agents/dqn/dqn_torch_policy.py
index 70b90a05d..9a1d2383d 100644
--- a/rllib/agents/dqn/dqn_torch_policy.py
+++ b/rllib/agents/dqn/dqn_torch_policy.py
@@ -180,8 +180,6 @@ def build_q_model_and_distribution(
         # generically into ModelCatalog.
         add_layer_norm=add_layer_norm)
 
-    policy.q_func_vars = model.variables()
-
     policy.target_q_model = ModelCatalog.get_model_v2(
         obs_space=obs_space,
         action_space=action_space,
@@ -201,8 +199,6 @@ def build_q_model_and_distribution(
         # generically into ModelCatalog.
         add_layer_norm=add_layer_norm)
 
-    policy.target_q_func_vars = policy.target_q_model.variables()
-
     return model, TorchCategorical
 
 
@@ -237,6 +233,7 @@ def build_q_losses(policy: Policy, model, _,
     Returns:
         TensorType: A single loss tensor.
     """
+    config = policy.config
     # Q-network evaluation.
     q_t, q_logits_t, q_probs_t, _ = compute_q_values(
@@ -302,6 +299,13 @@ def build_q_losses(policy: Policy, model, _,
 
 def adam_optimizer(policy: Policy,
                    config: TrainerConfigDict) -> "torch.optim.Optimizer":
+
+    # By this time, the models have been moved to the GPU - if any - and we
+    # can define our optimizers using the correct CUDA variables.
+    if not hasattr(policy, "q_func_vars"):
+        policy.q_func_vars = policy.model.variables()
+        policy.target_q_func_vars = policy.target_q_model.variables()
+
     return torch.optim.Adam(
         policy.q_func_vars, lr=policy.cur_lr, eps=config["adam_epsilon"])
 
diff --git a/rllib/agents/dqn/r2d2_torch_policy.py b/rllib/agents/dqn/r2d2_torch_policy.py
index dd6a38f41..9cdc35b7f 100644
--- a/rllib/agents/dqn/r2d2_torch_policy.py
+++ b/rllib/agents/dqn/r2d2_torch_policy.py
@@ -6,7 +6,7 @@ import gym
 import ray
 from ray.rllib.agents.dqn.dqn_tf_policy import (PRIO_WEIGHTS,
                                                 postprocess_nstep_and_prio)
-from ray.rllib.agents.dqn.dqn_torch_policy import \
+from ray.rllib.agents.dqn.dqn_torch_policy import adam_optimizer, \
     build_q_model_and_distribution, compute_q_values
 from ray.rllib.agents.dqn.r2d2_tf_policy import \
     get_distribution_inputs_and_class
@@ -232,12 +232,6 @@ class ComputeTDErrorMixin:
         self.compute_td_error = compute_td_error
 
 
-def adam_optimizer(policy: Policy,
-                   config: TrainerConfigDict) -> "torch.optim.Optimizer":
-    return torch.optim.Adam(
-        policy.q_func_vars, lr=policy.cur_lr, eps=config["adam_epsilon"])
-
-
 def build_q_stats(policy: Policy, batch) -> Dict[str, TensorType]:
     return dict({
         "cur_lr": policy.cur_lr,
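
Note on the pattern above: the `q_func_vars` / `target_q_func_vars` lists are no longer cached in `build_q_model_and_distribution()`, where the models still live on the CPU, but are resolved lazily inside `adam_optimizer()`, after any GPU placement has happened. Below is a minimal, self-contained sketch of that deferred lookup in plain PyTorch, not the RLlib API; `PolicySketch`, `to_device()`, and the constructor arguments are hypothetical names used only for illustration.

```python
import copy

import torch
import torch.nn as nn


class PolicySketch:
    """Hypothetical stand-in for a policy that builds its Q-models on CPU."""

    def __init__(self, lr=1e-3, adam_epsilon=1e-8):
        self.lr = lr
        self.adam_epsilon = adam_epsilon
        # The models are created on the CPU first ...
        self.model = nn.Linear(4, 2)
        self.target_q_model = copy.deepcopy(self.model)

    def to_device(self, device):
        # ... and may be moved (or copied) to a GPU later, before training.
        self.model = self.model.to(device)
        self.target_q_model = self.target_q_model.to(device)
        return self

    def adam_optimizer(self):
        # Resolve the parameter lists lazily, at optimizer-creation time, so
        # the optimizer tracks the tensors that actually live on the training
        # device rather than references cached at model-build time.
        if not hasattr(self, "q_func_vars"):
            self.q_func_vars = list(self.model.parameters())
            self.target_q_func_vars = list(self.target_q_model.parameters())
        return torch.optim.Adam(
            self.q_func_vars, lr=self.lr, eps=self.adam_epsilon)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy = PolicySketch().to_device(device)
optimizer = policy.adam_optimizer()  # built against the post-move parameters
```

The `hasattr` guard mirrors the patch: the lists are created once, on the first optimizer request, and reused afterwards, which also lets R2D2 import `adam_optimizer` from `dqn_torch_policy` instead of keeping its own copy.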