Skip to content

Commit

Permalink
feat: support isaac gym interface (#325)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gaiejj authored May 2, 2024
1 parent 3c9a235 commit 924f74c
Show file tree
Hide file tree
Showing 15 changed files with 692 additions and 19 deletions.
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,13 @@ repos:
^examples/|
^tests/|
^setup.py$|
^docs/source/conf.py$|
^omnisafe/envs/classic_control/envs_from_crabs.py$|
^omnisafe/common/control_barrier_function/crabs/models.py$|
^omnisafe/common/control_barrier_function/crabs/optimizers.py$|
^omnisafe/common/control_barrier_function/crabs/utils.py$|
^omnisafe/algorithms/off_policy/crabs.py$
^omnisafe/algorithms/off_policy/crabs.py$|
^omnisafe/utils/isaac_gym_utils.py$|
^docs/source/conf.py$
)
- repo: https://github.com/pycqa/pydocstyle
rev: 6.3.0
Expand Down
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ ignore=CVS,.vscode,.history
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths=^examples/$,^tests/$
ignore-paths=^examples/$,^tests/$,^omnisafe/utils/isaac_gym_utils.py$,

# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,21 @@ Here is a list of environments that [Safety-Gymnasium](https://www.safety-gymnas
<td>HalfCheetah, Hopper, Swimmer, Walker2d, Ant, Humanoid</td>
<td>SafetyHumanoidVelocity-v1</td>
</tr>
<tr>
<td rowspan="4">Safe Isaac Gym</td>
<td>OverSafeFinger</td>
<td rowspan="4">ShadowHand</td>
<td rowspan="4">ShadowHandOverSafeFinger</td>
</tr>
<tr>
<td>OverSafeJoint</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeFinger</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeJoint</td>
</tr>
</tbody>
</table>

Expand Down
6 changes: 6 additions & 0 deletions omnisafe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
# ==============================================================================
"""OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning."""

from contextlib import suppress


with suppress(ImportError):
from isaacgym import gymutil

from omnisafe import algorithms
from omnisafe.algorithms import ALGORITHMS
from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent
Expand Down
4 changes: 2 additions & 2 deletions omnisafe/adapter/crabs_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@ def eval_policy( # pylint: disable=too-many-locals
"""
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
obs, _ = self._eval_env.reset() # type: ignore
obs = obs.to(self._device)

done = False
while not done:
act = agent.step(obs, deterministic=False)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act) # type: ignore
obs, reward, cost, terminated, truncated = (
torch.as_tensor(x, dtype=torch.float32, device=self._device)
for x in (obs, reward, cost, terminated, truncated)
Expand Down
1 change: 1 addition & 0 deletions omnisafe/adapter/offpolicy_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def eval_policy( # pylint: disable=too-many-locals
agent (ConstraintActorCritic): Agent.
logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
"""
assert self._eval_env, 'Environment for evaluation has not been set!'
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
Expand Down
46 changes: 35 additions & 11 deletions omnisafe/adapter/online_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,17 @@ def __init__( # pylint: disable=too-many-arguments
env_cfgs = self._cfgs.env_cfgs.todict()

self._env: CMDP = make(env_id, num_envs=num_envs, device=self._device, **env_cfgs)
self._eval_env: CMDP = make(env_id, num_envs=1, device=self._device, **env_cfgs)

self._wrapper(
obs_normalize=cfgs.algo_cfgs.obs_normalize,
reward_normalize=cfgs.algo_cfgs.reward_normalize,
cost_normalize=cfgs.algo_cfgs.cost_normalize,
)

self._eval_env: CMDP | None = None
if self._env.need_evaluation:
self._eval_env = make(env_id, num_envs=1, device=self._device, **env_cfgs)
self._wrapper_eval(obs_normalize=cfgs.algo_cfgs.obs_normalize)

self._env.set_seed(seed)

def _wrapper(
Expand Down Expand Up @@ -116,32 +119,53 @@ def _wrapper(
"""
if self._env.need_time_limit_wrapper:
assert (
self._env.max_episode_steps and self._eval_env.max_episode_steps
self._env.max_episode_steps
), 'You must define max_episode_steps as an integer\
or cancel the use of the time_limit wrapper.'
\nor cancel the use of the time_limit wrapper.'
self._env = TimeLimit(
self._env,
time_limit=self._env.max_episode_steps,
device=self._device,
)
self._eval_env = TimeLimit(
self._eval_env,
time_limit=self._eval_env.max_episode_steps,
device=self._device,
)
if self._env.need_auto_reset_wrapper:
self._env = AutoReset(self._env, device=self._device)
if obs_normalize:
self._env = ObsNormalize(self._env, device=self._device)
self._eval_env = ObsNormalize(self._eval_env, device=self._device)
if reward_normalize:
self._env = RewardNormalize(self._env, device=self._device)
if cost_normalize:
self._env = CostNormalize(self._env, device=self._device)
self._env = ActionScale(self._env, low=-1.0, high=1.0, device=self._device)
self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
if self._env.num_envs == 1:
self._env = Unsqueeze(self._env, device=self._device)

def _wrapper_eval(
    self,
    obs_normalize: bool = True,
) -> None:
    """Wrap the evaluation environment.

    The wrappers are applied to ``self._eval_env`` in this order:
    ``TimeLimit`` and ``AutoReset`` (each only when the corresponding
    ``need_*_wrapper`` flag of ``self._env`` is set), ``ObsNormalize`` (only
    when ``obs_normalize`` is true), then ``ActionScale`` with ``low=-1.0``
    and ``high=1.0`` and ``Unsqueeze``, which are always applied. No reward
    or cost normalization wrapper is applied here.

    Args:
        obs_normalize (bool, optional): Whether to normalize the observation.
            Defaults to True.

    Raises:
        AssertionError: If the evaluation environment has not been created,
            or if a time limit wrapper is required but the evaluation
            environment does not define ``max_episode_steps``.
    """
    assert self._eval_env, 'Your environment for evaluation does not exist!'

    # The wrapper requirements are read from the training environment
    # (``self._env``), not from the evaluation environment itself.
    if self._env.need_time_limit_wrapper:
        # Adjacent literals are used instead of a backslash continuation inside
        # the string, so the message never depends on source indentation.
        assert self._eval_env.max_episode_steps, (
            'You must define max_episode_steps as an'
            '\ninteger or cancel the use of the time_limit wrapper.'
        )
        self._eval_env = TimeLimit(
            self._eval_env,
            time_limit=self._eval_env.max_episode_steps,
            device=self._device,
        )
    if self._env.need_auto_reset_wrapper:
        self._eval_env = AutoReset(self._eval_env, device=self._device)
    if obs_normalize:
        self._eval_env = ObsNormalize(self._eval_env, device=self._device)
    self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
    self._eval_env = Unsqueeze(self._eval_env, device=self._device)

@property
Expand Down
11 changes: 8 additions & 3 deletions omnisafe/adapter/onpolicy_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,20 @@ def rollout( # pylint: disable=too-many-locals

obs = next_obs
epoch_end = step >= steps_per_epoch - 1
if epoch_end:
num_dones = int(terminated.contiguous().sum())
if self._env.num_envs - num_dones:
logger.log(
f'\nWarning: trajectory cut off when rollout by epoch\
in {self._env.num_envs - num_dones} of {self._env.num_envs} environments.',
)

for idx, (done, time_out) in enumerate(zip(terminated, truncated)):
if epoch_end or done or time_out:
last_value_r = torch.zeros(1)
last_value_c = torch.zeros(1)
if not done:
if epoch_end:
logger.log(
f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.',
)
_, last_value_r, last_value_c, _ = agent.step(obs[idx])
if time_out:
_, last_value_r, last_value_c, _ = agent.step(
Expand Down
156 changes: 156 additions & 0 deletions omnisafe/configs/on-policy/PPO.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,159 @@ defaults:
activation: tanh
# learning rate
lr: 0.0003

ShadowHandCatchOver2UnderarmSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandCatchOver2UnderarmSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006
Loading

0 comments on commit 924f74c

Please sign in to comment.