I'm trying to train an A2C model in stable-baselines3, and the EvalCallback appears to freeze when it is called. I cannot figure out why. Below is a script that recreates the problem. First, here is the output of the print statements spread throughout it to monitor where the program is:

Creating environments for SubprocVecEnv...
Defining the A2C model...
Making an evaluation callback...
Training the A2C model...
Number of time steps at training start: 0
_on_step: CustomEvalCallback has been called 1 times
Number of time steps at rollout end: 3
_on_step: CustomEvalCallback has been called 2 times
Number of time steps at rollout end: 6
_on_step: CustomEvalCallback has been called 3 times
Number of time steps at rollout end: 9
_on_step: CustomEvalCallback has been called 4 times
Number of time steps at rollout end: 12
_on_step: CustomEvalCallback has been called 5 times
    eval will be performed now...

The script sits here for as long as I let it run. Why is this happening, and how do I fix it?

The script:

import numpy as np
import gymnasium as gym
import torch as th

from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback

from typing import Any


class DummyVecEnv(gym.Env):
    def __init__(self, n_controls: int = 1):
        super().__init__()
        self.n_controls = n_controls

        self.observation_space = gym.spaces.Box(
            low=0, high=1e4, shape=(1,),
            dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            low=-1e4, high=1e4, shape=(self.n_controls,),
            dtype=np.float32
        )

    def _get_obs(self) -> np.ndarray:
        return self.observation_space.sample()

    def reset(
            self,
            *,
            seed: int | None = None,
            config: dict[str, Any] = {},
    ) -> tuple[np.ndarray, dict[str, Any]]:
        return self._get_obs(), {}

    def step(
            self,
            action: np.ndarray
    ) -> (np.ndarray, float, bool, bool, dict):
        obs = self._get_obs()
        reward = np.random.rand()
        terminated = False
        truncated = False
        return obs, reward, terminated, truncated, {}


class MakeDummyEnv:
    def __init__(self, n_controls: int = 1, is_eval_env: bool = False):
        self.n_controls = n_controls
        self.is_eval_env = is_eval_env

    def __call__(self):
        env = DummyVecEnv(n_controls=self.n_controls)
        if self.is_eval_env:
            return Monitor(env)
        else:
            return env


class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_rollout_end(self) -> None:
        """ the rollout is lost before _on_step is called, log returns here """
        if hasattr(self.model, "num_timesteps"):
            print('Number of time steps at rollout end:', self.model.num_timesteps)
        if hasattr(self.model, "rollout_buffer"):
            rollout_data = next(self.model.rollout_buffer.get(batch_size=None))
            self.logger.record('train/buffer_size', len(rollout_data.actions))
            self.logger.record('train/return_mean', rollout_data.returns.mean().item())
            self.logger.record('train/return_std', rollout_data.returns.std().item())
            self.logger.record('train/first_env_temp',
                               rollout_data.observations.flatten()[0].item()
                               )

        if not hasattr(self.model, "n_iterations"):
            self.model.n_iterations = 0
        self.model.n_iterations += 1

    def _on_step(self) -> bool:
        """ log the std of each output here """
        if hasattr(self.model, "num_timesteps"):
            self.logger.record('train/num_timesteps', self.model.num_timesteps)
        if hasattr(self.model.policy, "log_std"):
            for i in range(len(self.model.policy.log_std)):
                self.logger.record(
                    f"train/std_{i:d}",
                    th.exp(self.model.policy.log_std[i]).item()
                )
        return True

    def _on_training_start(self) -> None:
        if hasattr(self.model, "num_timesteps"):
            print('Number of time steps at training start:', self.model.num_timesteps)

    def _on_training_end(self) -> None:
        if hasattr(self.model, "num_timesteps"):
            print('Number of time steps at training end:', self.model.num_timesteps)


class CustomEvalCallback(EvalCallback):
    def _on_step(self) -> bool:
        """ this is called by self.on_step which is called when EvalCallback is triggered """
        print(f"_on_step: CustomEvalCallback has been called {self.n_calls:d} times")
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            print('\teval will be performed now...')
        super()._on_step()
        return True


if __name__ == "__main__":
    print('Creating environments for SubprocVecEnv...')
    vec_env = SubprocVecEnv(
        env_fns=[MakeDummyEnv(n_controls=3) for _ in range(3)]
    )

    print('Defining the A2C model...')
    model = A2C(
        policy="MlpPolicy",
        env=vec_env,
        n_steps=1,
        verbose=0,
        device='cpu',
        tensorboard_log="./logs"
    )

    print('Making an evaluation callback...')
    n_eval_env = 2
    eval_env = SubprocVecEnv(
        env_fns=[
            MakeDummyEnv(n_controls=3, is_eval_env=True)
            for _ in range(n_eval_env)
        ]
    )
    eval_callback = CustomEvalCallback(
        eval_env=eval_env,
        callback_on_new_best=None,
        callback_after_eval=None,
        n_eval_episodes=n_eval_env,
        eval_freq=5,  # this appears to be number of iterations, not time steps
        log_path=None,  # leave this as None
        best_model_save_path='saves',  # saves as 'best_model' here
        deterministic=True,
        render=False,
        verbose=0,
        warn=True,
    )

    print('Training the A2C model...')
    model.learn(
        total_timesteps=25,
        progress_bar=False,
        log_interval=1,
        callback=[TensorboardCallback(), eval_callback],
        tb_log_name='dummy_log',
    )


  • There is a while loop in stable-baselines3's evaluate_policy that it is getting stuck in: github.com/DLR-RM/stable-baselines3/blob/master/… – Finncent Price, Jan 29 at 23:36
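
For context, evaluate_policy keeps stepping the evaluation environments until each one has finished its share of n_eval_episodes episodes. A simplified paraphrase of that loop (not the exact library source) is sketched below:

import numpy as np

def run_eval_loop(model, eval_env, n_eval_episodes):
    """Simplified paraphrase of the episode-counting loop inside
    stable_baselines3.common.evaluation.evaluate_policy."""
    n_envs = eval_env.num_envs
    episode_counts = np.zeros(n_envs, dtype=int)
    # how many episodes each evaluation env must finish
    episode_count_targets = np.array(
        [(n_eval_episodes + i) // n_envs for i in range(n_envs)], dtype=int
    )
    observations = eval_env.reset()
    while (episode_counts < episode_count_targets).any():
        actions, _ = model.predict(observations, deterministic=True)
        observations, rewards, dones, infos = eval_env.step(actions)
        for i in range(n_envs):
            if dones[i]:  # the only place the counters advance
                episode_counts[i] += 1
    # If the env never returns terminated/truncated as True, `dones` stays
    # False, the counters never reach their targets, and the loop never exits.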

1 Answer

The problem was that DummyVecEnv is never done, i.e. it never returns terminated or truncated as True. This means that evaluate_policy (called by the EvalCallback when an evaluation is triggered) never increments its count of completed episodes, so its while loop never ends.
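
One minimal fix is to let episodes end, for example by truncating after a fixed number of steps. Below is a sketch of the modified DummyVecEnv (same imports as the script above; max_episode_steps is an arbitrary cap introduced here, not part of the original script):

class DummyVecEnv(gym.Env):
    def __init__(self, n_controls: int = 1, max_episode_steps: int = 10):
        super().__init__()
        self.n_controls = n_controls
        self.max_episode_steps = max_episode_steps  # cap so episodes can finish
        self._step_count = 0

        self.observation_space = gym.spaces.Box(
            low=0, high=1e4, shape=(1,), dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            low=-1e4, high=1e4, shape=(self.n_controls,), dtype=np.float32
        )

    def _get_obs(self) -> np.ndarray:
        return self.observation_space.sample()

    def reset(self, *, seed: int | None = None, config: dict[str, Any] = {}):
        self._step_count = 0  # start a fresh episode
        return self._get_obs(), {}

    def step(self, action: np.ndarray):
        self._step_count += 1
        obs = self._get_obs()
        reward = np.random.rand()
        terminated = False
        truncated = self._step_count >= self.max_episode_steps  # episode now ends
        return obs, reward, terminated, truncated, {}

With this change, dones becomes True every max_episode_steps steps during evaluation, the episode counters in evaluate_policy reach their targets, and learn() proceeds past the evaluation. gymnasium.wrappers.TimeLimit would achieve the same thing, but it forwards reset(seed=..., options=...), so the env's reset signature would then need the standard options keyword instead of config.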
