diff --git a/docs/guide/data-pipeline.md b/docs/guide/data-pipeline.md
index a064649..a058300 100644
--- a/docs/guide/data-pipeline.md
+++ b/docs/guide/data-pipeline.md
@@ -87,7 +87,7 @@ For each model and batch size, it:
 
 1. Extracts power timelines from benchmark runs
 2. Resamples to a median-duration grid
-3. Fits [`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel] distributions per batch size
+3. Fits [`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel] distributions per batch size
 
 ```
 ML.ENERGY Benchmark Dataset                 mlenergy-data
@@ -129,20 +129,20 @@ $$p(x) = \frac{P_{\max}}{1 + \exp(-k_p(x - x_{0,p}))} + p_0, \quad x \triangleq
 
 where $P_{\max}$ is the saturation magnitude, $k_p$ controls transition sharpness, $x_{0,p}$ is the characteristic batch size threshold, and $p_0$ is an offset term. Latency and throughput use the same functional form with their own parameters.
 
-OpenG2G uses [`LogisticModel`][mlenergy_data.modeling.LogisticModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
+OpenG2G uses [`LogisticModel`][mlenergy.data.modeling.LogisticModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
 
-- **Generation**: [`LogisticModel.fit(x, y)`][mlenergy_data.modeling.logistic.LogisticModel.fit] fits the curve to benchmark data
-- **Runtime**: [`LogisticModel.eval(batch)`][mlenergy_data.modeling.logistic.LogisticModel.eval] evaluates the curve, and [`LogisticModel.deriv_wrt_x(x)`][mlenergy_data.modeling.logistic.LogisticModel.deriv_wrt_x] computes gradients for the OFO controller
+- **Generation**: [`LogisticModel.fit(x, y)`][mlenergy.data.modeling.logistic.LogisticModel.fit] fits the curve to benchmark data
+- **Runtime**: [`LogisticModel.eval(batch)`][mlenergy.data.modeling.logistic.LogisticModel.eval] evaluates the curve, and [`LogisticModel.deriv_wrt_x(x)`][mlenergy.data.modeling.logistic.LogisticModel.deriv_wrt_x] computes gradients for the OFO controller
 
 ## ITL Mixture Model
 
 Historical ITL measurements exhibit heavy-tailed behavior.
 The generation step captures this using a weighted mixture of two lognormal distributions per batch size.
 
-OpenG2G uses [`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
+OpenG2G uses [`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel] from [`mlenergy-data`](https://ml.energy/data) at both stages:
 
-- **Generation**: [`ITLMixtureModel.fit(samples)`][mlenergy_data.modeling.latency.ITLMixtureModel.fit] fits the mixture to raw ITL samples
-- **Runtime**: [`ITLMixtureModel.sample_avg(n_replicas, rng)`][mlenergy_data.modeling.latency.ITLMixtureModel.sample_avg] draws average latency across replicas
+- **Generation**: [`ITLMixtureModel.fit(samples)`][mlenergy.data.modeling.latency.ITLMixtureModel.fit] fits the mixture to raw ITL samples
+- **Runtime**: [`ITLMixtureModel.sample_avg(n_replicas, rng)`][mlenergy.data.modeling.latency.ITLMixtureModel.sample_avg] draws average latency across replicas
 
 ## Training Trace Generation
 
@@ -164,5 +164,5 @@ To use the dataset:
 
 At simulation time, the generated artifacts are consumed by two components:
 
-- **[`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter]**: Uses [`InferenceData`][openg2g.datacenter.workloads.inference.InferenceData] to replay periodic per-GPU power templates. Latency fits ([`ITLMixtureModel`][mlenergy_data.modeling.ITLMixtureModel]) are sampled at each control interval.
+- **[`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter]**: Uses [`InferenceData`][openg2g.datacenter.workloads.inference.InferenceData] to replay periodic per-GPU power templates. Latency fits ([`ITLMixtureModel`][mlenergy.data.modeling.ITLMixtureModel]) are sampled at each control interval.
 - **[`OFOBatchSizeController`][openg2g.controller.ofo.OFOBatchSizeController]**: Uses [`LogisticModelStore`][openg2g.controller.ofo.LogisticModelStore] for logistic curve evaluation. Calls `eval()` and `deriv_wrt_x()` at each control step to compute gradients.
diff --git a/examples/model_insights/plots.py b/examples/model_insights/plots.py
index 7fcf200..a13d390 100644
--- a/examples/model_insights/plots.py
+++ b/examples/model_insights/plots.py
@@ -41,10 +41,36 @@
 
 COLORS = {
     "baseline": "#9A9A9A",
-    "ofo": "#4C72B0",
-    "h100": "#4C72B0",
-    "b200": "#C44E52",
-    "hardware": "#C44E52",
+    "ofo": "#1F77B4",  # tab10 blue
+    "h100": "#1F77B4",  # tab10 blue
+    "b200": "#D62728",  # tab10 red
+    "hardware": "#D62728",  # tab10 red
+}
+
+_TAB10 = plt.get_cmap("tab10").colors
+
+# Stable per-model colors used wherever a model is identified by color
+# (model-size figure, hardware figure, etc.). The hardware figure uses
+# blue for Qwen 3 32B and red for Qwen 3 8B; that pair is canonical and
+# the rest of the palette is filled in around it.
+MODEL_COLORS: dict[str, tuple[float, float, float]] = {  # display name -> tab10 RGB
+    "Qwen 3 32B": _TAB10[0],  # blue
+    "Qwen 3 30B A3B": _TAB10[1],  # orange
+    "GPT-OSS 120B": _TAB10[2],  # green
+    "Qwen 3 8B": _TAB10[3],  # red
+    "Qwen 3 235B A22B": _TAB10[4],  # purple
+    "Qwen 3 235B A22B Thinking": _TAB10[4],  # same as 235B A22B
+    "Llama 3.1 70B": _TAB10[5],  # brown
+    "Llama 3.1 405B": _TAB10[6],  # pink
+}
+
+# Distinct color pairs for the two parallelism panels so (a) and (b) read as
+# separate experiments rather than the same one twice. The first pair reuses
+# the default h100/b200 hues (blue, red); the second pair uses tab10 orange and
+# green, which the model-size figure also uses but panel (a) does not.
+PARALLELISM_PAIR_COLORS = {
+    "gpt-oss-120b": (COLORS["h100"], COLORS["b200"]),
+    "qwen-235b-a22b-thinking": (_TAB10[1], _TAB10[2]),
 }
 
 DISPLAY_LABELS = {
@@ -356,15 +382,21 @@ def plot_model_size(
     agg = _aggregate(df)
     all_variants = df["variant"].drop_duplicates().tolist()
 
-    # Sort variants by power-swing range (least flexible → most flexible).
+    # Compute power-swing range per variant.
     def _swing(variant: str) -> float:
         row = df[(df["variant"] == variant) & (df["mode"] == OFO_MODE)].iloc[0].to_dict()
         _, p_mw, _ = _pareto_from_row(row, logistic_models)
         return float(p_mw.max() - p_mw.min()) if len(p_mw) else 0.0
 
-    variants = sorted(all_variants, key=_swing)
-    palette = plt.get_cmap("tab10").colors
-    color_map = {v: palette[i % len(palette)] for i, v in enumerate(variants)}
+    # Color is fixed per model identity (MODEL_COLORS), so the same model
+    # appears in the same color across model-size, hardware, and other figures.
+    # Unlisted models fall back to the tab10 slots MODEL_COLORS leaves unused.
+    color_map = {v: MODEL_COLORS.get(_pretty(v), _TAB10[7 + i % 3]) for i, v in enumerate(all_variants)}
+
+    # Plot order: widest → narrowest, so the reference model with the largest
+    # feasible power range anchors the leftmost position and other models
+    # trail as comparisons.
+    variants = sorted(all_variants, key=_swing, reverse=True)
 
     def _val(v, mode, col):
         row = agg[(agg.variant == v) & (agg["mode"] == mode)]
@@ -420,7 +452,8 @@ def _v(v, mode, col, sub=sub):
             r = sub[(sub["variant"] == v) & (sub["mode"] == mode)]
             return float(r[col].iloc[0]) if not r.empty else math.nan
 
-        color_map = {labels[i]: (COLORS["h100"] if i == 0 else COLORS["b200"]) for i in range(len(variants))}
+        pair_palette = PARALLELISM_PAIR_COLORS.get(pair, (COLORS["h100"], COLORS["b200"]))
+        color_map = {labels[i]: pair_palette[min(i, 1)] for i in range(len(variants))}  # i >= 2 reuses 2nd color
 
         # (a) integral violation — colored bars, hatch = baseline vs OFO
         fig, ax = _make_panel()
@@ -484,9 +517,11 @@ def _v(pair_, hw, mode, col):
         r = agg[(agg["variant"] == v) & (agg["mode"] == mode)]
         return float(r[col].iloc[0]) if not r.empty else math.nan
 
-    # Two-color encoding (blue for the first model pair, red for the
-    # second), matching the parallelism and precision figures.
-    pair_colors = {p: (COLORS["h100"] if i == 0 else COLORS["b200"]) for i, p in enumerate(pairs)}
+    # Color each pair by its canonical model color (MODEL_COLORS), so
+    # e.g. Qwen 3 8B is the same red here as in the model-size figure.
+    pair_colors = {
+        p: MODEL_COLORS[_pretty(next(v for (pp, _), v in variants_by_pair_hw.items() if pp == p))] for p in pairs
+    }
 
     # (a) — integral violation. X-axis = hardware. Within each hardware
     # group, 4 bars: 2 models × (uncoord, coord). Color = model, hatch =
@@ -654,7 +689,10 @@ def _v(v, mode, col):
     def _display_prec(label: str) -> str:
         return "BF16" if label == "bf16" else label
 
-    handles = [_combo_handle(color_map[lbl], _display_prec(lbl)) for lbl in sorted(set(labels))]
+    # Order: BF16 first (blue), then FP8 (red), as in the model-size, hardware,
+    # and parallelism legends; other labels follow rather than being dropped.
+    ordered_labels = sorted(set(labels), key=lambda lbl: (lbl != "bf16", lbl != "FP8", lbl))
+    handles = [_combo_handle(color_map[lbl], _display_prec(lbl)) for lbl in ordered_labels]
     _save_legend(handles, out / f"precision_legend{suffix}", ncol=len(set(labels)), width=1.8)
 
 
diff --git a/openg2g/controller/ofo.py b/openg2g/controller/ofo.py
index 38304c7..871713d 100644
--- a/openg2g/controller/ofo.py
+++ b/openg2g/controller/ofo.py
@@ -14,8 +14,8 @@
 from typing import Any
 
 import numpy as np
-from mlenergy_data.modeling import LogisticModel
-from mlenergy_data.records import LLMRuns
+from mlenergy.data.modeling import LogisticModel
+from mlenergy.data.records import LLMRuns
 from pydantic import BaseModel, ConfigDict
 
 from openg2g.clock import SimulationClock
diff --git a/openg2g/datacenter/workloads/inference.py b/openg2g/datacenter/workloads/inference.py
index b51ee73..4bee278 100644
--- a/openg2g/datacenter/workloads/inference.py
+++ b/openg2g/datacenter/workloads/inference.py
@@ -14,8 +14,8 @@
 
 import numpy as np
 import pandas as pd
-from mlenergy_data.modeling import ITLMixtureModel
-from mlenergy_data.records import LLMRuns
+from mlenergy.data.modeling import ITLMixtureModel
+from mlenergy.data.records import LLMRuns
 from pydantic import BaseModel, ConfigDict
 
 import openg2g
diff --git a/pyproject.toml b/pyproject.toml
index 331990f..ab20202 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ dependencies = [
     "pydantic>=2.0",
     "aiohttp",
     "zeus>=0.15.0",
-    "mlenergy-data",
+    "mlenergy-data>=0.4.0",
 ]
 
 [project.urls]
diff --git a/tests/test_logistic.py b/tests/test_logistic.py
index f1251ab..f032cb3 100644
--- a/tests/test_logistic.py
+++ b/tests/test_logistic.py
@@ -4,7 +4,7 @@
 
 import math
 
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 
 def test_eval_at_midpoint():
diff --git a/tests/test_offline_dc.py b/tests/test_offline_dc.py
index f1b1f4c..b8b5160 100644
--- a/tests/test_offline_dc.py
+++ b/tests/test_offline_dc.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 import pytest
-from mlenergy_data.modeling import ITLMixtureModel
+from mlenergy.data.modeling import ITLMixtureModel
 
 from openg2g.clock import SimulationClock
 from openg2g.coordinator import SimulationLog
diff --git a/tests/test_ofo_internals.py b/tests/test_ofo_internals.py
index 59045de..d24b773 100644
--- a/tests/test_ofo_internals.py
+++ b/tests/test_ofo_internals.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 import pytest
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 from openg2g.controller.ofo import (
     OFOConfig,
diff --git a/tests/test_ofo_observed_latency.py b/tests/test_ofo_observed_latency.py
index 2178dfa..783b4db 100644
--- a/tests/test_ofo_observed_latency.py
+++ b/tests/test_ofo_observed_latency.py
@@ -3,7 +3,7 @@
 from fractions import Fraction
 
 import numpy as np
-from mlenergy_data.modeling import LogisticModel
+from mlenergy.data.modeling import LogisticModel
 
 from openg2g.clock import SimulationClock
 from openg2g.common import ThreePhase
diff --git a/uv.lock b/uv.lock
index f63a01f..1df5ddc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1680,7 +1680,7 @@ wheels = [
 
 [[package]]
 name = "mlenergy-data"
-version = "0.3.2"
+version = "0.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
@@ -1691,9 +1691,9 @@ dependencies = [
     { name = "pyarrow" },
     { name = "pyyaml" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ac/55/131b85ce36e2c9ce91612aea93b5912a4730eeac5f5f157647b9040b9f4a/mlenergy_data-0.3.2.tar.gz", hash = "sha256:613f89c508c8a2c328962f015a3c09db8f8fd52f78356e606b4dc5ff75102365", size = 35503, upload-time = "2026-03-23T21:38:21.542Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/d6/1958e8445023dfdc748018af1bf349d4f8a3c09d1c61316accc8fdacc12d/mlenergy_data-0.4.0.tar.gz", hash = "sha256:94119d4884d19348af340ecb8d36e75270e37b72ea1d606e9976aa6a3293ce4c", size = 37477, upload-time = "2026-05-07T02:10:30.823Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/24/3e/52e2de434bb6603d4159f945267def38c7b6e5898d4b887226cd35bf7e60/mlenergy_data-0.3.2-py3-none-any.whl", hash = "sha256:56d71c9e6504b8a59eda3a282ab79f68db4b19a374b3b09872b70d3778c84ebe", size = 27503, upload-time = "2026-03-23T21:38:20.677Z" },
+    { url = "https://files.pythonhosted.org/packages/da/d8/4396dcb42dbfc9018f4c397df9e3304c7cbbc62f67afd6f6fe956cc1f889/mlenergy_data-0.4.0-py3-none-any.whl", hash = "sha256:5c793a9b625091ffcd2c6a071458e3bdf76d2461913868d2d8a48bb45907a71f", size = 28378, upload-time = "2026-05-07T02:10:29.593Z" },
 ]
 
 [[package]]
@@ -2299,7 +2299,7 @@ test = [
 requires-dist = [
     { name = "aiohttp" },
     { name = "gymnasium", marker = "extra == 'rl'" },
-    { name = "mlenergy-data" },
+    { name = "mlenergy-data", specifier = ">=0.4.0" },
     { name = "numpy" },
     { name = "opendssdirect-py", marker = "extra == 'opendss'" },
     { name = "pandas" },