24 changes: 24 additions & 0 deletions README.md
@@ -251,6 +251,30 @@ judgearena \
| `--n_bootstraps` | `20` | Bootstrap samples for ELO confidence intervals |
| `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias |
| `--result_folder` | `results` | Directory where annotations and results are saved |
| `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels |
| `--soft-elo-temperature` | `0.3` | Initial softmax temperature for `--soft-elo`; overridden if `--calibrate-temperature` succeeds |
| `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) |
| `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) |

### Soft-ELO & temperature calibration

By default, judge scores are discretised to hard win/loss/tie labels. Passing `--soft-elo` instead converts the raw score
difference into a continuous preference via a softmax, which is then fed into a soft Bradley-Terry model.
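The score-to-preference mapping can be sketched as follows. This is a minimal illustration, not the package API: `soft_preference` is a hypothetical helper, and it assumes the judge emits one scalar score per response, with a two-way softmax (i.e. a sigmoid over the score difference) producing the preference for A:

```python
import math

def soft_preference(score_a: float, score_b: float, temperature: float = 0.3) -> float:
    """Continuous preference for response A over response B.

    A two-way softmax over the raw score difference reduces to a sigmoid:
    p(A > B) = sigmoid((score_a - score_b) / temperature).
    Lower temperatures push preferences toward hard 0/1 labels;
    higher temperatures flatten them toward 0.5.
    """
    return 1.0 / (1.0 + math.exp(-(score_a - score_b) / temperature))
```

Equal scores yield a preference of 0.5 (the soft analogue of a tie), and `soft_preference(a, b) + soft_preference(b, a) == 1` by construction, which is what the soft Bradley-Terry targets require.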

To let the data choose the best temperature automatically, add `--calibrate-temperature`.
JudgeArena will run the judge on a sample of human-annotated arena battles, fit the temperature $T^*$ by MLE, and
use it for the full evaluation:

```bash
judgearena \
--task elo-lmarena-100k \
--model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
--judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
--n_instructions 200 \
--soft-elo \
--calibrate-temperature \
--calibration-size 300
```
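The calibration step can be sketched roughly as follows. This is illustrative only: `fit_temperature` is a hypothetical helper that uses a simple grid search rather than whatever optimiser the package actually employs, and it assumes human preferences are encoded as 1.0 / 0.5 / 0.0 for A-wins / tie / B-wins:

```python
import math

def fit_temperature(score_diffs, human_prefs, grid=None):
    """Pick the temperature T* that minimises the negative log-likelihood
    (cross-entropy) of human preferences under p = sigmoid(diff / T).

    score_diffs: judge score differences (score_A - score_B) per battle.
    human_prefs: human labels in [0, 1] (1.0 = A wins, 0.5 = tie, 0.0 = B wins).
    """
    if grid is None:
        grid = [0.05 * k for k in range(1, 41)]  # candidate T in (0, 2]

    def nll(t):
        total = 0.0
        eps = 1e-12  # guard against log(0) at extreme score differences
        for d, y in zip(score_diffs, human_prefs):
            p = 1.0 / (1.0 + math.exp(-d / t))
            total -= y * math.log(p + eps) + (1.0 - y) * math.log(1.0 - p + eps)
        return total

    return min(grid, key=nll)
```

Because cross-entropy is minimised when the model's preferences match the empirical ones, data generated at a given temperature recovers that temperature, and `--calibration-size` simply bounds how many human battles feed into `score_diffs`/`human_prefs`.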

### Output

30 changes: 30 additions & 0 deletions judgearena/cli.py
@@ -113,6 +113,32 @@ def _build_parser() -> argparse.ArgumentParser:
        default=None,
        help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).",
    )
    parser.add_argument(
        "--soft-elo",
        action="store_true",
        help="[elo] Use continuous judge preferences as soft BT targets instead of "
        "discretising to hard win/loss/tie.",
    )
    parser.add_argument(
        "--soft-elo-temperature",
        type=float,
        default=0.3,
        help="[elo] Initial PairScore temperature for --soft-elo. "
        "Overridden by --calibrate-temperature if calibration succeeds.",
    )
    parser.add_argument(
        "--calibrate-temperature",
        action="store_true",
        help="[elo] MLE-fit the PairScore temperature against human-labeled arena "
        "battles before the main run. Requires --soft-elo.",
    )
    parser.add_argument(
        "--calibration-size",
        type=int,
        default=None,
        help="[elo] Number of human arena battles to sample for temperature "
        "calibration. Defaults to all. Requires --calibrate-temperature.",
    )
    add_common_arguments(parser)
    return parser

@@ -191,6 +217,10 @@ def _build_elo_args(
        n_bootstraps=args.n_bootstraps,
        seed=args.seed,
        baseline_model=args.baseline_model,
        soft_elo=args.soft_elo,
        soft_elo_temperature=args.soft_elo_temperature,
        calibrate_temperature=args.calibrate_temperature,
        calibration_size=args.calibration_size,
        judge_model=args.judge_model,
        n_instructions=args.n_instructions,
        provide_explanation=args.provide_explanation,