stash ctrl_qsm

typoverflow · typoverflow · commit 448aae3736b6 · 2025-11-04T16:03:31.000-05:00
diff --git a/flowrl/agent/online/ctrl/ctrl_qsm.py b/flowrl/agent/online/ctrl/ctrl_qsm.py
@@ -107,7 +107,6 @@ def get_q_value(action: jnp.ndarray, obs: jnp.ndarray) -> jnp.ndarray:
         return q.min(axis=0).mean()
     q_grad_fn = jax.vmap(jax.grad(get_q_value))
     q_grad = q_grad_fn(at, batch.obs)
-    q_grad = alpha1 * q_grad - alpha2 * at
     eps_estimation = - alpha2 * q_grad / temp / (jnp.abs(q_grad).mean() + 1e-6)
 
     def actor_loss_fn(actor_params: Param, dropout_rng: PRNGKey) -> Tuple[jnp.ndarray, Metric]:
@@ -123,6 +122,7 @@ def actor_loss_fn(actor_params: Param, dropout_rng: PRNGKey) -> Tuple[jnp.ndarra
         return loss, {
             "loss/actor_loss": loss,
             "misc/eps_estimation_l1": jnp.abs(eps_estimation).mean(),
+            "misc/eps_estimation_std": jnp.std(eps_estimation, axis=0).mean(),
         }
 
     new_actor, actor_metrics = actor.apply_gradient(actor_loss_fn)
diff --git a/flowrl/config/online/mujoco/algo/ctrl/ctrl_qsm.py b/flowrl/config/online/mujoco/algo/ctrl/ctrl_qsm.py
@@ -37,4 +37,5 @@ class CtrlQSMConfig(BaseAlgoConfig):
     ranking: bool
 
     num_samples: int
+    temp: float
     diffusion: QSMDiffusionConfig
diff --git a/scripts/dmc/ctrl_qsm.sh b/scripts/dmc/ctrl_qsm.sh
@@ -0,0 +1,68 @@
+# Specify which GPUs to use
+GPUS=(0 1 2 3 4 5 6 7)  # Modify this array to specify which GPUs to use
+SEEDS=(0 1 2 3)  # Seeds combined with every entry of TASKS when launching
+NUM_EACH_GPU=3  # Per-GPU multiplier used to size the job pool below
+
+PARALLEL=$((NUM_EACH_GPU * ${#GPUS[@]}))  # Passed to env_parallel as -P
+
+TASKS=(
+    "acrobot-swingup"
+    "ball_in_cup-catch"
+    "cartpole-balance"
+    "cartpole-balance_sparse"
+    "cartpole-swingup"
+    "cartpole-swingup_sparse"
+    "cheetah-run"
+    "dog-run"
+    "dog-stand"
+    "dog-trot"
+    "dog-walk"
+    "finger-spin"
+    "finger-turn_easy"
+    "finger-turn_hard"
+    "fish-swim"
+    "hopper-hop"
+    "hopper-stand"
+    "humanoid-run"
+    "humanoid-stand"
+    "humanoid-walk"
+    "pendulum-swingup"
+    "quadruped-run"
+    "quadruped-walk"
+    "reacher-easy"
+    "reacher-hard"
+    "walker-run"
+    "walker-stand"
+    "walker-walk"
+)
+
+SHARED_ARGS=(
+    "algo=ctrl_qsm"
+    "log.tag=default"
+    "log.project=flow-rl"
+    "log.entity=lambda-rl"
+)
+
+# run_task TASK SEED SLOT: print the training command for one task/seed pair and,
+# unless DRY_RUN is non-empty, run it on GPUS[SLOT % number of GPUs].
+run_task() {
+    task=$1
+    seed=$2
+    slot=$3
+    num_gpus=${#GPUS[@]}
+    device_idx=$((slot % num_gpus))
+    device=${GPUS[$device_idx]}
+    echo "Running $task $seed on GPU $device"
+    command="python3 examples/online/main_dmc_offpolicy.py task=$task device=$device seed=$seed ${SHARED_ARGS[@]}"
+    echo $command
+    if [ -z "$DRY_RUN" ]; then
+        $command
+    fi
+}
+
+. env_parallel.bash  # Sourced so that env_parallel is available in this shell
+if [ -n "$DRY_RUN" ]; then  # Dry run: no progress bar, no --results directory
+    env_parallel -P${PARALLEL} run_task {1} {2} {%} ::: ${TASKS[@]} ::: ${SEEDS[@]}
+else
+    env_parallel --bar --results log/parallel/$name -P${PARALLEL} run_task {1} {2} {%} ::: ${TASKS[@]} ::: ${SEEDS[@]}  # NOTE(review): $name is never set in this script; confirm the caller exports it
+fi