Policy iteration on FrozenLake-v0 (Python)
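
The script below creates the 4x4 FrozenLake environment, initializes a uniform policy, runs policy iteration through the PolicyIteration agent and the RLSerialAgentTrainer helper, prints the resulting greedy policy, and plots the state-value function.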

import gym
import numpy as np
import matplotlib.pyplot as plt

from src.algorithms.dp.policy_iteration import PolicyIteration, DPAlgoConfig
from src.policies.uniform_policy import UniformPolicy
from src.policies.max_action_policy_adaptor import MaxActionPolicyAdaptor
from src.worlds.world_helpers import n_actions, n_states
from src.trainers.rl_serial_agent_trainer import RLSerialTrainerConfig, RLSerialAgentTrainer


def plot_values(v):
    # reshape value function
    V_sq = np.reshape(v, (4, 4))

    # plot the state-value function
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    im = ax.imshow(V_sq, cmap='cool')
    for (j, i), label in np.ndenumerate(V_sq):
        ax.text(i, j, np.round(label, 5), ha='center', va='center', fontsize=14)
    plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
    plt.title('State-Value Function')
    plt.show()


if __name__ == '__main__':
    # create the 4x4 FrozenLake environment
    env = gym.make("FrozenLake-v0")

    # start from a uniform policy over all state-action pairs; the adaptor
    # selects the maximizing action when the policy is improved
    policy_init = UniformPolicy(n_actions=n_actions(env),
                                n_states=n_states(env),
                                init_val=None)
    policy_adaptor = MaxActionPolicyAdaptor()

    # algorithm configuration: undiscounted returns (gamma = 1.0) and up to
    # 100 iterations per episode
    agent_config = DPAlgoConfig()
    agent_config.gamma = 1.0
    agent_config.n_itrs_per_episode = 100
    agent_config.policy = policy_init

    agent = PolicyIteration(algo_config=agent_config, policy_adaptor=policy_adaptor)

    # train for at most 100 episodes
    config = RLSerialTrainerConfig()
    config.n_episodes = 100

    trainer = RLSerialAgentTrainer(agent=agent, config=config)

    # run the training loop and collect the control result
    ctrl_res = trainer.train(env)

    print(f"Converged {ctrl_res.converged}")
    print(f"Number of iterations {ctrl_res.n_itrs}")
    print(f"Residual {ctrl_res.residual}")

    print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
    print(agent.policy.policy, "\n")

    # plot the final state-value function on the 4x4 grid
    plot_values(agent.v)
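
For comparison, here is a minimal, self-contained sketch of textbook policy iteration working directly on the environment's transition table env.P. This is not the library's PolicyIteration implementation, only an illustration of the algorithm the classes above wrap; the helper names policy_evaluation and policy_iteration are made up for this sketch.

import gym
import numpy as np


def policy_evaluation(env, policy, gamma=1.0, tol=1e-8):
    # iterative policy evaluation: sweep all states until the value change is small
    n_s = env.observation_space.n
    v = np.zeros(n_s)
    while True:
        delta = 0.0
        for s in range(n_s):
            a = policy[s]
            # env.P[s][a] is a list of (prob, next_state, reward, done) tuples
            v_new = sum(p * (r + gamma * v[s2] * (1.0 - done))
                        for p, s2, r, done in env.P[s][a])
            delta = max(delta, abs(v_new - v[s]))
            v[s] = v_new
        if delta < tol:
            return v


def policy_iteration(env, gamma=1.0):
    # alternate evaluation and greedy improvement until the policy is stable
    n_s, n_a = env.observation_space.n, env.action_space.n
    policy = np.zeros(n_s, dtype=int)
    while True:
        v = policy_evaluation(env, policy, gamma)
        stable = True
        for s in range(n_s):
            q = [sum(p * (r + gamma * v[s2] * (1.0 - done))
                     for p, s2, r, done in env.P[s][a])
                 for a in range(n_a)]
            best = int(np.argmax(q))
            if best != policy[s]:
                stable = False
            policy[s] = best
        if stable:
            return policy, v


# usage: policy, v = policy_iteration(gym.make("FrozenLake-v0").unwrapped)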