"""Test whether a CQLTrainer can learn from an offline Pendulum-v0 file.
|
|
|
|
It does demonstrate, how to use CQL with a simple json offline file.
|
|
|
|
Important node: Make sure that your offline data file contains only
|
|
a single timestep per line to mimic the way SAC pulls samples from
|
|
the buffer.
|
|
|
|
Generate the offline json file by running an SAC algo until it reaches expert
|
|
level on your command line:
|
|
$ cd ray
|
|
$ rllib train -f rllib/tuned_examples/sac/pendulum-sac.yaml
|
|
|
|
Also make sure that in the above SAC yaml file, you specify an
|
|
additional "output" key with any path on your local file system.
|
|
In that path, the offline json file will be written to.
|
|
|
|
Use the generated file(s) as "input" in the CQL config below, then run
|
|
this script.
|
|
"""

import numpy as np
import os

from ray.rllib.agents import cql as cql
from ray.rllib.utils.framework import try_import_torch

torch, _ = try_import_torch()

if __name__ == "__main__":
    # See rllib/tuned_examples/cql/pendulum-cql.yaml for comparison.
    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["horizon"] = 200
    config["soft_horizon"] = True
    config["no_done_at_end"] = True
    config["n_step"] = 3
    config["bc_iters"] = 0
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 256
    config["rollout_fragment_length"] = 1
    config["prioritized_replay"] = False
    config["tau"] = 0.005
    config["target_entropy"] = "auto"
    config["Q_model"] = {
        "fcnet_hiddens": [256, 256],
        "fcnet_activation": "relu",
    }
    config["policy_model"] = {
        "fcnet_hiddens": [256, 256],
        "fcnet_activation": "relu",
    }
    config["optimization"] = {
        "actor_learning_rate": 3e-4,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    }
    config["train_batch_size"] = 256
    config["target_network_update_freq"] = 1
    config["timesteps_per_iteration"] = 1000
    data_file = "/path/to/my/json_file.json"
    print("data_file={} exists={}".format(data_file,
                                          os.path.isfile(data_file)))
    config["input"] = [data_file]
    config["log_level"] = "INFO"
    config["env"] = "Pendulum-v0"

    # Set up evaluation.
    config["evaluation_num_workers"] = 1
    config["evaluation_interval"] = 1
    config["evaluation_num_episodes"] = 10
    # Keep this False: iterations are very long and parallel evaluation would
    # lag one iteration behind training.
    config["evaluation_parallel_to_training"] = False
    # Evaluate on the actual environment (not on the offline data).
    config["evaluation_config"] = {"input": "sampler"}

    # Check whether we can learn from the given file in `num_iterations`
    # iterations, up to a reward of `min_reward`.
    num_iterations = 5
    min_reward = -300

    # Test for the torch framework (tf not implemented yet).
    trainer = cql.CQLTrainer(config=config)
    learnt = False
    for i in range(num_iterations):
        print(f"Iter {i}")
        eval_results = trainer.train().get("evaluation")
        if eval_results:
            print("... R={}".format(eval_results["episode_reward_mean"]))
            # Learn until some reward is reached on an actual live env.
            if eval_results["episode_reward_mean"] >= min_reward:
                learnt = True
                break
    if not learnt:
        raise ValueError("CQLTrainer did not reach {} reward from expert "
                         "offline data!".format(min_reward))

    # Get policy, model, and replay-buffer.
    pol = trainer.get_policy()
    cql_model = pol.model
    # CQL exposes its local replay buffer as a module-level variable in
    # `ray.rllib.agents.cql.cql`, so we can import it here after training.
    from ray.rllib.agents.cql.cql import replay_buffer

    # If you would like to query CQL's learnt Q-function for arbitrary
    # (continuous) actions, do the following:
    obs_batch = torch.from_numpy(np.random.random(size=(5, 3)))
    action_batch = torch.from_numpy(np.random.random(size=(5, 1)))
    q_values = cql_model.get_q_values(obs_batch, action_batch)
    # If you are using "twin_q", there are two Q-networks and one usually
    # takes the min of the two outputs, like so:
    twin_q_values = cql_model.get_twin_q_values(obs_batch, action_batch)
    final_q_values = torch.min(q_values, twin_q_values)
    print(final_q_values)

    # Example of how to evaluate the trained Trainer using the data from our
    # buffer.
    # Get a sample (MultiAgentBatch -> SampleBatch).
    batch = replay_buffer.replay().policy_batches["default_policy"]
    obs = torch.from_numpy(batch["obs"])
    # Pass the observations through our model to get the features, which are
    # then passed through the Q-head.
    model_out, _ = cql_model({"obs": obs})
    # The estimated Q-values for the (historic) actions in the batch.
    q_values_old = cql_model.get_q_values(model_out,
                                          torch.from_numpy(batch["actions"]))
    # The estimated Q-values for the new actions computed by our trainer
    # policy.
    actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
    q_values_new = cql_model.get_q_values(model_out,
                                          torch.from_numpy(actions_new))
    print(f"Q-val batch={q_values_old}")
    print(f"Q-val policy={q_values_new}")

    trainer.stop()