gpu2grid · jaywonchung · May 7, 2026 · May 7, 2026
diff --git a/docs/guide/data-pipeline.md b/docs/guide/data-pipeline.md
@@ -87,7 +87,7 @@ For each model and batch size, it:
 
 1. Extracts power timelines from benchmark runs
 2. Resamples to a median-duration grid
-3. Fits [`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel] distributions per batch size
+3. Fits [`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel] distributions per batch size
 
 ```
 ML.ENERGY Benchmark Dataset                 mlenergy-data
@@ -129,20 +129,20 @@ $$p(x) = \frac{P_{\max}}{1 + \exp(-k_p(x - x_{0,p}))} + p_0, \quad x \triangleq
 
 where $P_{\max}$ is the saturation magnitude, $k_p$ controls transition sharpness, $x_{0,p}$ is the characteristic batch size threshold, and $p_0$ is an offset term. Latency and throughput use the same functional form with their own parameters.
 
-OpenG2G uses [`LogisticModel`][mlenergy_data.modeling.LogisticModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
+OpenG2G uses [`LogisticModel`][mlenergy.data.modeling.LogisticModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
 
-- **Generation**: [`LogisticModel.fit(x, y)`][mlenergy_data.modeling.logistic.LogisticModel.fit] fits the curve to benchmark data
-- **Runtime**: [`LogisticModel.eval(batch)`][mlenergy_data.modeling.logistic.LogisticModel.eval] evaluates the curve, and [`LogisticModel.deriv_wrt_x(x)`][mlenergy_data.modeling.logistic.LogisticModel.deriv_wrt_x] computes gradients for the OFO controller
+- **Generation**: [`LogisticModel.fit(x, y)`][mlenergy.data.modeling.logistic.LogisticModel.fit] fits the curve to benchmark data
+- **Runtime**: [`LogisticModel.eval(batch)`][mlenergy.data.modeling.logistic.LogisticModel.eval] evaluates the curve, and [`LogisticModel.deriv_wrt_x(x)`][mlenergy.data.modeling.logistic.LogisticModel.deriv_wrt_x] computes gradients for the OFO controller
 
 ## ITL Mixture Model
 
 Historical ITL measurements exhibit heavy-tailed behavior.
 The generation step captures this using a weighted mixture of two lognormal distributions per batch size.
 
-OpenG2G uses [`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
+OpenG2G uses [`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
 
-- **Generation**: [`ITLMixtureModel.fit(samples)`][mlenergy_data.modeling.latency.ITLMixtureModel.fit] fits the mixture to raw ITL samples
-- **Runtime**: [`ITLMixtureModel.sample_avg(n_replicas, rng)`][mlenergy_data.modeling.latency.ITLMixtureModel.sample_avg] draws average latency across replicas
+- **Generation**: [`ITLMixtureModel.fit(samples)`][mlenergy.data.modeling.latency.ITLMixtureModel.fit] fits the mixture to raw ITL samples
+- **Runtime**: [`ITLMixtureModel.sample_avg(n_replicas, rng)`][mlenergy.data.modeling.latency.ITLMixtureModel.sample_avg] draws average latency across replicas
 
 ## Training Trace Generation
 
@@ -164,5 +164,5 @@ To use the dataset:
 
 At simulation time, the generated artifacts are consumed by two components:
 
-- **[`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter]**: Uses [`InferenceData`][openg2g.datacenter.workloads.inference.InferenceData] to replay periodic per-GPU power templates. Latency fits ([`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel]) are sampled at each control interval.
+- **[`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter]**: Uses [`InferenceData`][openg2g.datacenter.workloads.inference.InferenceData] to replay periodic per-GPU power templates. Latency fits ([`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel]) are sampled at each control interval.
 - **[`OFOBatchSizeController`][openg2g.controller.ofo.OFOBatchSizeController]**: Uses [`LogisticModelStore`][openg2g.controller.ofo.LogisticModelStore] for logistic curve evaluation. Calls `eval()` and `deriv_wrt_x()` at each control step to compute gradients.
diff --git a/examples/model_insights/plots.py b/examples/model_insights/plots.py
@@ -41,10 +41,36 @@
 
 COLORS = {
     "baseline": "#9A9A9A",
-    "ofo": "#4C72B0",
-    "h100": "#4C72B0",
-    "b200": "#C44E52",
-    "hardware": "#C44E52",
+    "ofo": "#1F77B4",  # tab10 blue
+    "h100": "#1F77B4",  # tab10 blue
+    "b200": "#D62728",  # tab10 red
+    "hardware": "#D62728",  # tab10 red
+}
+
+_TAB10 = plt.get_cmap("tab10").colors
+
+# Stable per-model colors (tab10 RGB float tuples) used wherever a model is
+# identified by color (model-size figure, hardware figure, etc.). The hardware
+# figure uses blue for Qwen 3 32B and red for Qwen 3 8B; that pair is
+# canonical and the rest of the palette is filled in around it.
+MODEL_COLORS: dict[str, tuple[float, float, float]] = {
+    "Qwen 3 32B": _TAB10[0],  # blue
+    "Qwen 3 30B A3B": _TAB10[1],  # orange
+    "GPT-OSS 120B": _TAB10[2],  # green
+    "Qwen 3 8B": _TAB10[3],  # red
+    "Qwen 3 235B A22B": _TAB10[4],  # purple
+    "Qwen 3 235B A22B Thinking": _TAB10[4],  # same as 235B A22B
+    "Llama 3.1 70B": _TAB10[5],  # brown
+    "Llama 3.1 405B": _TAB10[6],  # pink
+}
+
+# Distinct color pairs for the two parallelism panels so (a) and (b) read as
+# separate experiments rather than the same one twice. The first pair reuses
+# the default h100/b200 hues (blue, red); the second pair picks two of the
+# remaining tab10 colors used by the model-size figure that are not in (a).
+PARALLELISM_PAIR_COLORS = {
+    "gpt-oss-120b": (COLORS["h100"], COLORS["b200"]),
+    "qwen-235b-a22b-thinking": (_TAB10[1], _TAB10[2]),
 }
 
 DISPLAY_LABELS = {
@@ -356,15 +382,21 @@ def plot_model_size(
     agg = _aggregate(df)
     all_variants = df["variant"].drop_duplicates().tolist()
 
-    # Sort variants by power-swing range (least flexible → most flexible).
+    # Compute power-swing range per variant.
     def _swing(variant: str) -> float:
         row = df[(df["variant"] == variant) & (df["mode"] == OFO_MODE)].iloc[0].to_dict()
         _, p_mw, _ = _pareto_from_row(row, logistic_models)
         return float(p_mw.max() - p_mw.min()) if len(p_mw) else 0.0
 
-    variants = sorted(all_variants, key=_swing)
-    palette = plt.get_cmap("tab10").colors
-    color_map = {v: palette[i % len(palette)] for i, v in enumerate(variants)}
+    # Color is fixed per model identity (MODEL_COLORS), so the same model
+    # appears in the same color across model-size, hardware, and other
+    # figures.
+    color_map = {v: MODEL_COLORS[_pretty(v)] for v in all_variants}
+
+    # Plot order: widest → narrowest, so the reference model with the largest
+    # feasible power range anchors the leftmost position and other models
+    # trail as comparisons.
+    variants = sorted(all_variants, key=_swing, reverse=True)
 
     def _val(v, mode, col):
         row = agg[(agg.variant == v) & (agg["mode"] == mode)]
@@ -420,7 +452,8 @@ def _v(v, mode, col, sub=sub):
             r = sub[(sub["variant"] == v) & (sub["mode"] == mode)]
             return float(r[col].iloc[0]) if not r.empty else math.nan
 
-        color_map = {labels[i]: (COLORS["h100"] if i == 0 else COLORS["b200"]) for i in range(len(variants))}
+        pair_palette = PARALLELISM_PAIR_COLORS.get(pair, (COLORS["h100"], COLORS["b200"]))
+        color_map = {labels[i]: pair_palette[i] for i in range(len(variants))}
 
         # (a) integral violation — colored bars, hatch = baseline vs OFO
         fig, ax = _make_panel()
@@ -484,9 +517,11 @@ def _v(pair_, hw, mode, col):
         r = agg[(agg["variant"] == v) & (agg["mode"] == mode)]
         return float(r[col].iloc[0]) if not r.empty else math.nan
 
-    # Two-color encoding (blue for the first model pair, red for the
-    # second), matching the parallelism and precision figures.
-    pair_colors = {p: (COLORS["h100"] if i == 0 else COLORS["b200"]) for i, p in enumerate(pairs)}
+    # Color each pair by its canonical model color (MODEL_COLORS), so
+    # e.g. Qwen 3 8B is the same red here as in the model-size figure.
+    pair_colors = {
+        p: MODEL_COLORS[_pretty(next(v for (pp, _), v in variants_by_pair_hw.items() if pp == p))] for p in pairs
+    }
 
     # (a) — integral violation. X-axis = hardware. Within each hardware
     # group, 4 bars: 2 models × (uncoord, coord). Color = model, hatch =
@@ -654,7 +689,10 @@ def _v(v, mode, col):
     def _display_prec(label: str) -> str:
         return "BF16" if label == "bf16" else label
 
-    handles = [_combo_handle(color_map[lbl], _display_prec(lbl)) for lbl in sorted(set(labels))]
+    # Order: BF16 first (blue), then FP8 (red), matching the blue-then-red
+    # order used in the model-size, hardware, and parallelism legends.
+    ordered_labels = [lbl for lbl in ("bf16", "FP8") if lbl in set(labels)]
+    handles = [_combo_handle(color_map[lbl], _display_prec(lbl)) for lbl in ordered_labels]
     _save_legend(handles, out / f"precision_legend{suffix}", ncol=len(set(labels)), width=1.8)
 
 

diff --git a/openg2g/controller/ofo.py b/openg2g/controller/ofo.py
@@ -14,8 +14,8 @@
 from typing import Any
 
 import numpy as np
-from mlenergy_data.modeling import LogisticModel
-from mlenergy_data.records import LLMRuns
+from mlenergy.data.modeling import LogisticModel
+from mlenergy.data.records import LLMRuns
 from pydantic import BaseModel, ConfigDict
 
 from openg2g.clock import SimulationClock

diff --git a/openg2g/datacenter/workloads/inference.py b/openg2g/datacenter/workloads/inference.py
@@ -14,8 +14,8 @@
 
 import numpy as np
 import pandas as pd
-from mlenergy_data.modeling import ITLMixtureModel
-from mlenergy_data.records import LLMRuns
+from mlenergy.data.modeling import ITLMixtureModel
+from mlenergy.data.records import LLMRuns
 from pydantic import BaseModel, ConfigDict
 
 import openg2g

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ dependencies = [
     "pydantic>=2.0",
     "aiohttp",
     "zeus>=0.15.0",
-    "mlenergy-data",
+    "mlenergy-data>=0.4.0",
 ]
 
 [project.urls]

diff --git a/tests/test_logistic.py b/tests/test_logistic.py
@@ -4,7 +4,7 @@
 
 import math
 
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 
 def test_eval_at_midpoint():

diff --git a/tests/test_offline_dc.py b/tests/test_offline_dc.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 import pytest
-from mlenergy_data.modeling import ITLMixtureModel
+from mlenergy.data.modeling import ITLMixtureModel
 
 from openg2g.clock import SimulationClock
 from openg2g.coordinator import SimulationLog

diff --git a/tests/test_ofo_internals.py b/tests/test_ofo_internals.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 import pytest
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 from openg2g.controller.ofo import (
     OFOConfig,

diff --git a/tests/test_ofo_observed_latency.py b/tests/test_ofo_observed_latency.py
@@ -3,7 +3,7 @@
 from fractions import Fraction
 
 import numpy as np
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 from openg2g.clock import SimulationClock
 from openg2g.common import ThreePhase

diff --git a/uv.lock b/uv.lock