recomm-sys001-ppo:
    env: ray.rllib.examples.env.recommender_system_envs.RecommSys001
    run: PPO
    stop:
        #evaluation/episode_reward_mean: 48.0
        timesteps_total: 200000
    config:
        framework: tf

        metrics_num_episodes_for_smoothing: 1000

        # Env c'tor kwargs:
        env_config:
            # Number of different categories a doc can have and a user can
            # have a preference for.
            num_categories: 2
            # Number of docs to choose (a slate) from each timestep.
            num_docs_to_select_from: 10
            # Slate size.
            slate_size: 1
            # Re-sample docs each timestep.
            num_docs_in_db: 100
            # Re-sample users each episode.
            num_users_in_db: 100
            # User time budget (determines lengths of episodes).
            user_time_budget: 60.0

        # Larger networks seem to help (large obs/action spaces).
        model:
            fcnet_hiddens: [256, 256]

        # Larger batch sizes seem to help (more stability, even with higher lr).
        #train_batch_size: 32

        #num_workers: 2
        #num_gpus: 0

        #lr_choice_model: 0.002
        #lr_q_model: 0.002

        #target_network_update_freq: 500
        #tau: 1.0

        # Evaluation settings.
        evaluation_interval: 1
        evaluation_num_workers: 4
        evaluation_duration: 200
        evaluation_duration_unit: episodes
        evaluation_parallel_to_training: true
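
# How to run: a minimal sketch, assuming this file is saved as
# `recomm-sys001-ppo.yaml` and an RLlib version that still resolves dotted
# `env:` class paths (the filename and the programmatic variant below are
# assumptions, not part of this config):
#
#   rllib train -f recomm-sys001-ppo.yaml
#
# A roughly equivalent launch via Ray Tune in Python, reusing the stop
# criterion and config dict defined above and moving the `env` entry into
# the config:
#
#   import yaml
#   from ray import tune
#
#   with open("recomm-sys001-ppo.yaml") as f:
#       experiment = yaml.safe_load(f)["recomm-sys001-ppo"]
#   config = experiment["config"]
#   config["env"] = experiment["env"]
#   tune.run(experiment["run"], stop=experiment["stop"], config=config)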