
Commit d833d9e

SQIL and PC performance check fixes (#811)
* Reduce training times in SQIL tests to make the test suite faster.
* Skip the continuous SQIL test with TD3 since it is unstable.
* Disable a SQIL test because it is flaky and slow.
* Make SQIL tests more deterministic by adding more seeding.
* Increase training time and number of samples to compare in the PC performance test.
1 parent 20366b0 commit d833d9e
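
Two of the fixes listed above are standard patterns rather than anything SQIL-specific: marking an unstable test as skipped, and threading a fixed seed into the learner's keyword arguments. A minimal sketch of both, where SEED and the test names are illustrative placeholders rather than names from this repository:

import pytest

SEED = 42  # illustrative constant; the real test module defines its own seed


@pytest.mark.skip(reason="This test is flaky.")
def test_disabled_case():
    # pytest still collects this test but reports it as skipped with the
    # reason above, so its flakiness can no longer fail CI.
    raise AssertionError("unreachable while the skip marker is present")


def test_seeded_case():
    # The SQIL tests below pass a seed through rl_kwargs, which is presumably
    # forwarded to the underlying stable-baselines3 learner; together with
    # venv.seed(SEED) this makes repeated runs collect the same transitions.
    rl_kwargs = dict(learning_starts=10, seed=SEED)
    assert rl_kwargs["seed"] == SEED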

2 files changed: +10 −8 lines changed


tests/algorithms/test_preference_comparisons.py

Lines changed: 3 additions & 3 deletions
@@ -1068,7 +1068,7 @@ def test_that_trainer_improves(
     novice_agent_rewards, _ = evaluation.evaluate_policy(
         agent_trainer.algorithm.policy,
         action_is_reward_venv,
-        25,
+        50,
         return_episode_rewards=True,
     )
 
@@ -1077,7 +1077,7 @@ def test_that_trainer_improves(
     # after this training, and thus `later_rewards` should have lower loss.
     first_reward_network_stats = main_trainer.train(20, 20)
 
-    later_reward_network_stats = main_trainer.train(50, 20)
+    later_reward_network_stats = main_trainer.train(100, 40)
     assert (
         first_reward_network_stats["reward_loss"]
         > later_reward_network_stats["reward_loss"]
@@ -1087,7 +1087,7 @@ def test_that_trainer_improves(
     trained_agent_rewards, _ = evaluation.evaluate_policy(
         agent_trainer.algorithm.policy,
         action_is_reward_venv,
-        25,
+        50,
         return_episode_rewards=True,
     )
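
The change above doubles the evaluation budget (25 → 50 episodes) and the second training call (from train(50, 20) to train(100, 40)), so the improvement check compares less noisy estimates. A minimal stand-alone sketch of the evaluation call, assuming stable-baselines3's evaluate_policy and a throwaway PPO policy on CartPole rather than the fixtures used in the actual test:

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

venv = make_vec_env("CartPole-v1", n_envs=1, seed=0)
model = PPO("MlpPolicy", venv, seed=0)  # untrained stand-in for the test's agent

# With return_episode_rewards=True the helper returns one reward per episode
# (plus episode lengths) instead of a (mean, std) pair. Averaging 50 episodes
# instead of 25 halves the variance of the mean-reward estimate, which makes
# the novice-vs-trained comparison in the test less likely to fail by chance.
episode_rewards, episode_lengths = evaluate_policy(
    model.policy,
    venv,
    n_eval_episodes=50,
    return_episode_rewards=True,
)
print(f"mean={np.mean(episode_rewards):.1f}  std={np.std(episode_rewards):.1f}")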

tests/algorithms/test_sqil.py

Lines changed: 7 additions & 5 deletions
@@ -90,7 +90,7 @@ def _test_sqil_no_crash(
         rl_algo_class=rl_algo_class,
         rl_kwargs=rl_kwargs,
     )
-    model.train(total_timesteps=5000)
+    model.train(total_timesteps=500)
 
 
 def test_sqil_no_crash_discrete(
@@ -104,7 +104,7 @@ def test_sqil_no_crash_discrete(
         cartpole_venv,
         "seals/CartPole-v0",
         rl_algo_class=dqn.DQN,
-        rl_kwargs=dict(learning_starts=1000),
+        rl_kwargs=dict(learning_starts=100),
     )
 
 
@@ -143,7 +143,7 @@ def _test_sqil_few_demonstrations(
         rl_algo_class=rl_algo_class,
         rl_kwargs=rl_kwargs,
     )
-    model.train(total_timesteps=1_000)
+    model.train(total_timesteps=1_00)
 
 
 def test_sqil_few_demonstrations_discrete(
@@ -157,7 +157,7 @@ def test_sqil_few_demonstrations_discrete(
         cartpole_venv,
         "seals/CartPole-v0",
         rl_algo_class=dqn.DQN,
-        rl_kwargs=dict(learning_starts=10),
+        rl_kwargs=dict(learning_starts=10, seed=42),
     )
 
 
@@ -174,6 +174,7 @@ def test_sqil_few_demonstrations_continuous(
         pendulum_single_venv,
         "Pendulum-v1",
         rl_algo_class=rl_algo_class,
+        rl_kwargs=dict(seed=42),
     )
 
 
@@ -203,7 +204,7 @@ def _test_sqil_performance(
         return_episode_rewards=True,
     )
 
-    model.train(total_timesteps=10_000)
+    model.train(total_timesteps=1_000)
 
     venv.seed(SEED)
     rewards_after, _ = evaluate_policy(
@@ -239,6 +240,7 @@ def test_sqil_performance_discrete(
     )
 
 
+@pytest.mark.skip(reason="This test is flaky.")
 @pytest.mark.parametrize("rl_algo_class", RL_ALGOS_CONT_ACTIONS)
 def test_sqil_performance_continuous(
     rng: np.random.Generator,
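
The shortened no-crash run above only makes sense because learning_starts is reduced alongside total_timesteps: with the old value of 1000, a 500-step run would end before DQN ever left its warm-up phase. A minimal sketch of that interaction using plain stable-baselines3 DQN on CartPole; the SQIL tests forward these settings through rl_kwargs rather than calling DQN directly:

from stable_baselines3 import DQN

# learning_starts=100 means the agent collects 100 exploratory steps and then
# performs gradient updates over the remaining 400 steps of this short run.
# With learning_starts=1000 the same 500-step budget would finish before any
# update happened, so the run would not exercise the learning step at all.
model = DQN(
    "MlpPolicy",
    "CartPole-v1",
    learning_starts=100,
    seed=42,  # mirrors the extra seeding added for determinism
)
model.learn(total_timesteps=500)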
