|
29 | 29 |
|
30 | 30 | import logging |
31 | 31 | import json |
32 | | -import pandas as pd |
| 32 | +import random |
| 33 | +import statistics |
33 | 34 |
|
34 | 35 |
|
35 | 36 | AggregationMethod = Literal["mean", "max", "min", "bootstrap"] |
@@ -122,30 +123,25 @@ async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig |
122 | 123 | raise |
123 | 124 |
|
124 | 125 |
|
125 | | -def calculate_bootstrap_scores(all_scores: list[float]) -> float: |
| 126 | +def calculate_bootstrap_scores(all_scores: list[float], n_boot: int = 100, seed: int | None = None) -> float: |
126 | 127 | """ |
127 | | - Calculate bootstrap confidence intervals for individual scores. |
| 128 | + Calculate the mean of bootstrap sample means for a list of scores. |
128 | 129 |
|
129 | 130 | Args: |
130 | | - all_scores: List of individual scores from all rows |
| 131 | + all_scores: List of individual scores from all rows. |
| 132 | + n_boot: Number of bootstrap resamples to draw (default 100). |
| 133 | + seed: Optional RNG seed for reproducibility. |
131 | 134 |
|
132 | 135 | Returns: |
133 | | - Mean bootstrap score |
| 136 | + Mean bootstrap score (float). Returns 0.0 if all_scores is empty. |
134 | 137 | """ |
135 | 138 | if not all_scores: |
136 | 139 | return 0.0 |
137 | 140 |
|
138 | | - # Create DataFrame (single column of scores) |
139 | | - battles = pd.DataFrame({"score": all_scores}) |
140 | | - |
141 | | - # Bootstrap sampling for calculating relative performance |
142 | | - bootstrap_means = [battles.sample(frac=1.0, replace=True)["score"].mean() for _ in range(100)] |
143 | | - |
144 | | - # Calculate final scores |
145 | | - bootstraps = pd.Series(bootstrap_means) |
146 | | - mean_score = bootstraps.mean() |
147 | | - |
148 | | - return float(mean_score) |
| 141 | + rng = random.Random(seed) if seed is not None else random |
| 142 | + k = len(all_scores) |
| 143 | + bootstrap_means = [statistics.fmean(rng.choices(all_scores, k=k)) for _ in range(n_boot)] |
| 144 | + return float(statistics.fmean(bootstrap_means)) |
149 | 145 |
|
150 | 146 |
|
151 | 147 | def aggregate(scores: list[float], method: AggregationMethod) -> float: |
|
0 commit comments