# Given a SAC-generated offline file, created via:
# rllib train -f tuned_examples/sac/pendulum-sac.yaml --no-ray-ui

# Pendulum CQL can attain ~-300 reward within ~10k training steps from that file.
pendulum-cql:
    env: Pendulum-v1
    run: CQL
    stop:
        evaluation/episode_reward_mean: -700
        timesteps_total: 800000
    config:
        # Works for both torch and tf.
        framework: tf
        # Use one or more offline files or "input: sampler" for online learning.
        input: 'dataset'
        input_config:
            paths: ["tests/data/pendulum/enormous.zip"]
            format: 'json'
        # Our input file above comes from an SAC run. Actions in there
        # are already normalized (produced by SquashedGaussian).
        actions_in_input_normalized: true
        clip_actions: true
        # Use two Q-networks (clipped double-Q), as in SAC.
        twin_q: true
        train_batch_size: 2000
        # Number of iterations of behavior-cloning pretraining before
        # switching over to the CQL loss.
        bc_iters: 100
        num_workers: 2
        # Minimum wall-time (seconds) per reported training iteration.
        min_time_s_per_iteration: 10
        # Average episode metrics over this many most-recent episodes.
        metrics_smoothing_episodes: 5

        # Evaluate in an actual environment.
        evaluation_interval: 1
        evaluation_num_workers: 2
        evaluation_duration: 10
        evaluation_parallel_to_training: true
        evaluation_config:
            input: sampler
            explore: false
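
# A minimal sketch of launching this config, mirroring the SAC command in the
# header above. The path below is hypothetical; adjust it to wherever this
# file lives in your checkout:
# rllib train -f tuned_examples/cql/pendulum-cql.yaml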