"""
Dummy drop-in replacement for the unity3d_client.py script,
for when you don't have an actual Unity3D engine installed or just want
to test client/server connectivity with the unity3d_server.py script.

This client script simply uses RLlib's RandomMultiAgentEnv to mimic
one of the ML Agents (Unity3D) example games (e.g. "3DBall").

To run this script on possibly different machines
against a central Policy server:

1) Run (two separate shells/machines):
$ python unity3d_server.py --env 3DBall
$ python unity3d_dummy_client.py --env 3DBall --inference-mode=local
"""

import argparse

from ray.rllib.env.policy_client import PolicyClient
from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv
from ray.rllib.examples.env.random_env import RandomMultiAgentEnv

# Default location of the policy server this client talks to. Both values
# can be overridden from the command line via --server and --port.
SERVER_ADDRESS = "localhost"
SERVER_PORT = 9900


def _build_parser():
    """Create the command line parser for this dummy client script.

    Returns:
        An `argparse.ArgumentParser` that knows the options --env,
        --horizon, --server, --port, --no-train, --inference-mode,
        --update-interval-local-mode and --num-episodes.
    """
    cli = argparse.ArgumentParser()

    # Which ML-Agents example game to imitate.
    cli.add_argument(
        "--env",
        default="3DBall",
        type=str,
        choices=[
            "3DBall",
            "3DBallHard",
            "FoodCollector",
            "GridFoodCollector",
            "Pyramids",
            "Sorter",
            "Tennis",
            "VisualHallway",
            "Walker",
        ],
        help=("The name of the Env to mimic. Only those examples supported "
              "so far for which all agents have the same observation- and "
              "action spaces (feel free to add more to this script!)"))

    # Episode length cap.
    cli.add_argument(
        "--horizon",
        default=200,
        type=int,
        help=("The max. number of `step()`s for any episode (per agent) "
              "before it'll be reset again automatically."))

    # Where to find the policy server.
    cli.add_argument(
        "--server",
        default=SERVER_ADDRESS,
        type=str,
        help="The Policy server's address to connect to from this client.")
    cli.add_argument(
        "--port",
        default=SERVER_PORT,
        type=int,
        help="The port to use (on --server).")

    # Training/inference behavior.
    cli.add_argument(
        "--no-train",
        action="store_true",
        help="Whether to disable training (on the server side).")
    cli.add_argument(
        "--inference-mode",
        default="local",
        type=str,
        choices=["local", "remote"],
        help=("Whether to compute actions `local`ly or `remote`ly. Note "
              "that `local` is much faster b/c observations/actions do not "
              "have to be sent via the network."))
    cli.add_argument(
        "--update-interval-local-mode",
        default=10.0,
        type=float,
        help=("For `inference-mode=local`, every how many seconds do we "
              "update learnt policy weights from the server?"))

    # Stopping criterion.
    cli.add_argument(
        "--num-episodes",
        default=10,
        type=int,
        help="Stop once the specified number of episodes have been played.")

    return cli


parser = _build_parser()

if __name__ == "__main__":
    # Script entry point: connect to the policy server, build a random
    # multi-agent env that has the same spaces as the chosen Unity3D example
    # game, and play `--num-episodes` episodes against the server.
    args = parser.parse_args()

    # Start the client for sending environment information (e.g. observations,
    # actions) to a policy server (listening on --server:--port).
    client = PolicyClient(
        "http://" + args.server + ":" + str(args.port),
        inference_mode=args.inference_mode,
        update_interval=args.update_interval_local_mode)

    # Get the multi-agent policies dict and agent->policy
    # mapping-fn. Only the policies dict is needed by this dummy client.
    policies, policy_mapping_fn = \
        Unity3DEnv.get_policy_configs_for_game(args.env)

    # Make sure all policies' obs- and action spaces are the same.
    # If not, we won't be able to mimic the Unity3D env using RLlib's
    # RandomMultiAgentEnv (which is configured with one obs- and one action
    # space below). Raise explicitly rather than `assert`ing so that the
    # check also runs under `python -O`.
    first_policy_spec = next(iter(policies.values()))
    for pid, policy_spec in policies.items():
        same_obs_space = (policy_spec.observation_space ==
                          first_policy_spec.observation_space)
        same_action_space = (
            policy_spec.action_space == first_policy_spec.action_space)
        if not (same_obs_space and same_action_space):
            raise ValueError(
                "Cannot mimic env '{}': policy '{}' uses observation-/action "
                "spaces that differ from those of the other policies.".format(
                    args.env, pid))

    # Create the dummy env standing in for the actual Unity3D game (no
    # Unity3D editor or game binary is started by this script).
    env = RandomMultiAgentEnv({
        # One agent per policy returned for the chosen game.
        # NOTE(review): the real game may run several agents per policy;
        # confirm that this count is what you want to mimic.
        "num_agents": len(policies),
        # Make sure we stick to the user given horizons using our
        # RandomMultiAgentEnv options.
        "max_episode_len": args.horizon,
        "p_done": 0.0,
        # Same obs- action spaces as the actual Unity3D game would have.
        "observation_space": first_policy_spec.observation_space,
        "action_space": first_policy_spec.action_space,
    })

    training_enabled = not args.no_train

    # Play exactly `--num-episodes` episodes. Each episode is started AND
    # ended on the client (including the last one), so the server always
    # gets told about the end of every episode before this script finishes.
    for _ in range(args.num_episodes):
        obs = env.reset()
        eid = client.start_episode(training_enabled=training_enabled)

        # Keep track of the total reward (summed over all agents) of this
        # episode.
        total_rewards_this_episode = 0.0

        all_done = False
        while not all_done:
            # Get actions from the Policy server given our current obs.
            actions = client.get_action(eid, obs)
            # Apply actions to our env.
            obs, rewards, dones, infos = env.step(actions)
            total_rewards_this_episode += sum(rewards.values())
            # Log rewards and single-agent dones.
            client.log_returns(
                eid, rewards, infos, multiagent_done_dict=dones)
            # The episode is over once all agents are done.
            all_done = dones["__all__"]

        print("Episode done: Reward={}".format(total_rewards_this_episode))
        # End the episode, passing the last observations.
        client.end_episode(eid, obs)