ray/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml

multi-agent-cartpole-crashing-pg:
    env: ray.rllib.examples.env.cartpole_crashing.MultiAgentCartPoleCrashing
    run: PG
    stop:
        evaluation/episode_reward_mean: 320.0
        num_env_steps_sampled: 300000
    config:
        # Works for both torch and tf.
        framework: tf
        env_config:
            config:
                num_agents: 2
                # Crash roughly every 300 ts. This should be ok to measure 300+
                # reward (episodes are 200 ts long).
                p_crash: 0.0025  # Prob to crash during step().
                p_crash_reset: 0.01  # Prob to crash during reset().
                # Time for the env to initialize when newly created.
                # Every time a remote sub-environment crashes, a new env is
                # created in its place and will take this long (sleep) to
                # "initialize".
                init_time_s: 1.0
        horizon: 200
        num_workers: 4
        num_envs_per_worker: 3
        # Use parallel remote envs.
        remote_worker_envs: true
        # Switch on resiliency for failed sub-environments (within a
        # vectorized stack).
        restart_failed_sub_environments: true

        # Evaluate for 10 episodes each iteration, in parallel to training,
        # on one dedicated evaluation worker and without exploration.
        evaluation_num_workers: 1
        evaluation_interval: 1
        evaluation_duration: 10
        evaluation_duration_unit: episodes
        evaluation_parallel_to_training: true
        evaluation_config:
            explore: false
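
A minimal sketch (not part of the original file) of one way such a tuned-example YAML could be launched programmatically with Ray Tune; the file path below is an assumption about where this YAML lives relative to the working directory.

# Assumes Ray 2.x and PyYAML are installed.
import yaml
from ray import air, tune

# Load the single experiment defined in the tuned-example file.
with open("multi-agent-cartpole-crashing-with-remote-envs-pg.yaml") as f:
    experiments = yaml.safe_load(f)

name, spec = next(iter(experiments.items()))

# Translate the tuned-example layout (env/run/stop/config) into Tuner args.
param_space = dict(spec["config"], env=spec["env"])

tuner = tune.Tuner(
    spec["run"],  # "PG", a registered RLlib algorithm name.
    param_space=param_space,
    run_config=air.RunConfig(name=name, stop=spec["stop"]),
)
tuner.fit()

Equivalently, Ray's rllib train CLI can consume tuned-example files like this one directly, though the exact flag syntax varies across Ray versions.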