From 89bcbae593403c00c84e11f9404deda09d26f90d Mon Sep 17 00:00:00 2001
From: Lars Erik Wik <lars.erik.wik@northern.tech>
Date: Thu, 7 May 2026 11:20:40 +0200
Subject: [PATCH] Build tuples in canonical (name-sorted) order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make tuple identity independent of the order fields are declared in the
config. Previously, the position of each value within a row's primary-key
or subsidiary tuple followed the declaration order in `tables.toml`, so
swapping two fields in the config silently changed every record's hash
key and produced bogus deltas against pre-swap state and blocks.

Replace `compute_key_indices` with `compute_canonical_columns`, which
sorts columns lexicographically by field name within each group and
returns each entry paired with its `FieldConfig`. Bundling index and
parser eliminates the implicit "two sorts must agree" coupling between
indices and configs that the old layout had. SQL emission still follows
the wire's column order, which is now canonical, so generated INSERT
column lists and composite-PK WHERE clauses are emitted in lexicographic
order — a follow-up will reroute SQL emission through the hub's declared
order so users keep their preferred column ordering in generated SQL.

Signed-off-by: Lars Erik Wik <lars.erik.wik@northern.tech>
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/config.rs                |  52 --------
 src/table.rs                 | 224 ++++++++++++++++++++++++-----------
 tests/accept_composite_pk.rs |  11 +-
 3 files changed, 163 insertions(+), 124 deletions(-)
diff --git a/src/config.rs b/src/config.rs
index b1c0ba5..cfff699 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -455,21 +455,6 @@ impl TableConfig {
             .map(|field| field.name.clone())
             .collect()
     }
-
-    /// Return field names in PK-first order: primary key fields first (in
-    /// declaration order), then subsidiary fields (in declaration order).
-    /// This matches the ordering used by `Table::load()` when building the
-    /// in-memory table stored in the STATE file.
-    pub fn ordered_field_names(&self) -> Vec<String> {
-        let primary_key = self.primary_key();
-        let mut names = primary_key.clone();
-        for field in &self.fields {
-            if !primary_key.contains(&field.name) {
-                names.push(field.name.clone());
-            }
-        }
-        names
-    }
 }
 
 impl Validate for Config {
@@ -574,43 +559,6 @@ impl Config {
 mod tests {
     use super::*;
 
-    fn make_field(name: &str, primary_key: bool) -> FieldConfig {
-        FieldConfig {
-            name: name.to_string(),
-            primary_key,
-            ..Default::default()
-        }
-    }
-
-    #[test]
-    fn test_ordered_field_names() {
-        let config = TableConfig {
-            source: "test.csv".to_string(),
-            header: false,
-            fields: vec![
-                make_field("name", false),
-                make_field("id", true),
-                make_field("email", false),
-            ],
-        };
-        assert_eq!(config.ordered_field_names(), vec!["id", "name", "email"]);
-    }
-
-    #[test]
-    fn test_ordered_field_names_multiple_primary_keys() {
-        let config = TableConfig {
-            source: "test.csv".to_string(),
-            header: false,
-            fields: vec![
-                make_field("value", false),
-                make_field("pk_b", true),
-                make_field("pk_a", true),
-            ],
-        };
-        // PKs in declaration order, then subsidiaries
-        assert_eq!(config.ordered_field_names(), vec!["pk_b", "pk_a", "value"]);
-    }
-
     #[test]
     fn test_validate_rejects_true_sentinel_on_non_boolean() {
         let field = FieldConfig {
diff --git a/src/table.rs b/src/table.rs
index ed8c1e4..2583dc5 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -14,6 +14,12 @@ use crate::value::{
 
 type ProtoTable = crate::proto::table::Table;
 
+/// Tuple positions (column index, field config) for one half of a row,
+/// either the primary-key columns or the subsidiaries. Sorted
+/// lexicographically by field name to keep tuple identity stable across
+/// config field reorderings.
+type CanonicalColumns<'a> = Vec<(usize, &'a FieldConfig)>;
+
 /// A table with records stored in a hash map for efficient lookup.
 /// Fields are ordered with primary key columns first, followed by subsidiary columns.
 #[derive(Debug, Clone, PartialEq)]
@@ -116,23 +122,35 @@ impl Table {
         Ok(indices)
     }
 
-    /// Split field indices into primary-key and subsidiary groups based on
-    /// which fields are marked as primary keys in the config.
-    fn compute_key_indices(
-        field_names: &[String],
-        primary_key: &[String],
+    /// Split columns into primary-key and subsidiary groups, each sorted
+    /// lexicographically by field name. Each entry pairs a column index
+    /// with the `FieldConfig` used to parse the value at that column. The
+    /// canonical ordering makes tuple identity independent of the order
+    /// fields are declared in the config, so reordering fields in
+    /// `tables.toml` does not change the on-disk or on-the-wire
+    /// representation.
+    fn compute_canonical_columns<'a>(
+        config: &'a TableConfig,
         field_indices: &[usize],
-    ) -> (Vec<usize>, Vec<usize>) {
-        let mut primary_indices = Vec::new();
-        let mut subsidiary_indices = Vec::new();
-        for (config_index, name) in field_names.iter().enumerate() {
-            if primary_key.contains(name) {
-                primary_indices.push(field_indices[config_index]);
+    ) -> (CanonicalColumns<'a>, CanonicalColumns<'a>) {
+        let mut entries: Vec<(&str, usize, &FieldConfig)> = config
+            .fields
+            .iter()
+            .zip(field_indices.iter())
+            .map(|(field, &idx)| (field.name.as_str(), idx, field))
+            .collect();
+        entries.sort_by_key(|(name, _, _)| *name);
+
+        let mut primary_columns: CanonicalColumns = Vec::new();
+        let mut subsidiary_columns: CanonicalColumns = Vec::new();
+        for (_, idx, field) in entries {
+            if field.primary_key {
+                primary_columns.push((idx, field));
             } else {
-                subsidiary_indices.push(field_indices[config_index]);
+                subsidiary_columns.push((idx, field));
             }
         }
-        (primary_indices, subsidiary_indices)
+        (primary_columns, subsidiary_columns)
     }
 
     #[cfg(test)]
@@ -153,21 +171,15 @@ impl Table {
         mut reader: csv::Reader<File>,
     ) -> Result<Self> {
         let field_names = config.field_names();
-        let primary_key = config.primary_key();
         let field_indices = Self::resolve_field_indices(config, &mut reader)?;
-        let (primary_indices, subsidiary_indices) =
-            Self::compute_key_indices(&field_names, &primary_key, &field_indices);
-
-        // Field configs split by primary-key flag in declaration order. This
-        // matches the ordering of `primary_indices` / `subsidiary_indices` —
-        // both are derived from `config.fields` in declaration order — so
-        // zipping the two pairs lines up the right config with each value.
-        let primary_field_configs: Vec<&FieldConfig> =
-            config.fields.iter().filter(|f| f.primary_key).collect();
-        let subsidiary_field_configs: Vec<&FieldConfig> =
-            config.fields.iter().filter(|f| !f.primary_key).collect();
+        let (primary_columns, subsidiary_columns) =
+            Self::compute_canonical_columns(config, &field_indices);
 
-        let fields = config.ordered_field_names();
+        let fields: Vec<String> = primary_columns
+            .iter()
+            .chain(subsidiary_columns.iter())
+            .map(|(_, field)| field.name.clone())
+            .collect();
 
         let mut records: HashMap<Vec<Value>, Vec<Value>> = HashMap::new();
 
@@ -190,9 +202,9 @@ impl Table {
                 continue;
             }
 
-            let primary_key = parse_columns(&record, &primary_indices, &primary_field_configs)
+            let primary_key = parse_columns(&record, &primary_columns)
                 .with_context(|| format!("row {}", row_num + 1))?;
-            let subsidiary = parse_columns(&record, &subsidiary_indices, &subsidiary_field_configs)
+            let subsidiary = parse_columns(&record, &subsidiary_columns)
                 .with_context(|| format!("row {}", row_num + 1))?;
 
             if records.insert(primary_key.clone(), subsidiary).is_some() {
@@ -204,17 +216,16 @@ impl Table {
     }
 }
 
-/// Pull values at `csv_indices` out of `record` and parse each one into a
-/// typed `Value` according to its corresponding `FieldConfig`. The two
-/// slices must be the same length and aligned 1-to-1.
+/// For each `(column_index, field_config)` entry, pull the value at
+/// `column_index` out of `record` and parse it into a typed `Value`
+/// according to `field_config`.
 fn parse_columns(
     record: &csv::StringRecord,
-    csv_indices: &[usize],
-    field_configs: &[&FieldConfig],
+    columns: &[(usize, &FieldConfig)],
 ) -> Result<Vec<Value>> {
-    let mut out = Vec::with_capacity(csv_indices.len());
-    for (&csv_index, &field) in csv_indices.iter().zip(field_configs.iter()) {
-        out.push(parse_field_value(&record[csv_index], field)?);
+    let mut out = Vec::with_capacity(columns.len());
+    for &(column_index, field) in columns {
+        out.push(parse_field_value(&record[column_index], field)?);
     }
     Ok(out)
 }
@@ -267,59 +278,140 @@ mod tests {
         }
     }
 
-    // -- compute_key_indices tests --
+    // -- compute_canonical_columns tests --
+
+    /// Project `compute_canonical_columns` output into `(column_index, name)`
+    /// pairs for easy assertion.
+    fn extract<'a>(columns: &[(usize, &'a FieldConfig)]) -> Vec<(usize, &'a str)> {
+        columns
+            .iter()
+            .map(|(idx, field)| (*idx, field.name.as_str()))
+            .collect()
+    }
 
     #[test]
-    fn test_compute_key_indices_single_primary_key() {
-        let field_names = ["id", "name", "email"].map(String::from).to_vec();
-        let primary_key = ["id"].map(String::from).to_vec();
+    fn test_compute_canonical_columns_single_primary_key() {
+        let config = make_config(
+            vec![
+                make_field("id", true),
+                make_field("name", false),
+                make_field("email", false),
+            ],
+            false,
+        );
         let field_indices = vec![0, 1, 2];
 
-        let (primary_indices, subsidiary_indices) =
-            Table::compute_key_indices(&field_names, &primary_key, &field_indices);
+        let (primary, subsidiary) = Table::compute_canonical_columns(&config, &field_indices);
 
-        assert_eq!(primary_indices, vec![0]);
-        assert_eq!(subsidiary_indices, vec![1, 2]);
+        assert_eq!(extract(&primary), vec![(0, "id")]);
+        // Subsidiaries sorted lexicographically: email before name.
+        assert_eq!(extract(&subsidiary), vec![(2, "email"), (1, "name")]);
     }
 
     #[test]
-    fn test_compute_key_indices_composite_primary_key() {
-        let field_names = ["region", "id", "name"].map(String::from).to_vec();
-        let primary_key = ["region", "id"].map(String::from).to_vec();
+    fn test_compute_canonical_columns_composite_primary_key() {
+        let config = make_config(
+            vec![
+                make_field("region", true),
+                make_field("id", true),
+                make_field("name", false),
+            ],
+            false,
+        );
         let field_indices = vec![0, 1, 2];
 
-        let (primary_indices, subsidiary_indices) =
-            Table::compute_key_indices(&field_names, &primary_key, &field_indices);
+        let (primary, subsidiary) = Table::compute_canonical_columns(&config, &field_indices);
 
-        assert_eq!(primary_indices, vec![0, 1]);
-        assert_eq!(subsidiary_indices, vec![2]);
+        // Primary keys sorted lexicographically: id before region.
+        assert_eq!(extract(&primary), vec![(1, "id"), (0, "region")]);
+        assert_eq!(extract(&subsidiary), vec![(2, "name")]);
     }
 
     #[test]
-    fn test_compute_key_indices_reordered_columns() {
-        // CSV columns are in a different order than the config fields.
-        let field_names = ["id", "name", "email"].map(String::from).to_vec();
-        let primary_key = ["id"].map(String::from).to_vec();
+    fn test_compute_canonical_columns_reordered_columns() {
+        // Source columns are in a different order than the declared fields.
+        let config = make_config(
+            vec![
+                make_field("id", true),
+                make_field("name", false),
+                make_field("email", false),
+            ],
+            true,
+        );
         let field_indices = vec![2, 0, 1]; // id->col2, name->col0, email->col1
 
-        let (primary_indices, subsidiary_indices) =
-            Table::compute_key_indices(&field_names, &primary_key, &field_indices);
+        let (primary, subsidiary) = Table::compute_canonical_columns(&config, &field_indices);
 
-        assert_eq!(primary_indices, vec![2]);
-        assert_eq!(subsidiary_indices, vec![0, 1]);
+        assert_eq!(extract(&primary), vec![(2, "id")]);
+        assert_eq!(extract(&subsidiary), vec![(1, "email"), (0, "name")]);
     }
 
     #[test]
-    fn test_compute_key_indices_all_primary_keys() {
-        let field_names = ["a", "b"].map(String::from).to_vec();
-        let primary_key = ["a", "b"].map(String::from).to_vec();
+    fn test_compute_canonical_columns_all_primary_keys() {
+        let config = make_config(vec![make_field("b", true), make_field("a", true)], false);
         let field_indices = vec![0, 1];
 
-        let (primary_indices, subsidiary_indices) =
-            Table::compute_key_indices(&field_names, &primary_key, &field_indices);
+        let (primary, subsidiary) = Table::compute_canonical_columns(&config, &field_indices);
 
-        assert_eq!(primary_indices, vec![0, 1]);
-        assert!(subsidiary_indices.is_empty());
+        // Both PKs, sorted lexicographically: a before b. The column index
+        // at each tuple position follows the field's source position.
+        assert_eq!(extract(&primary), vec![(1, "a"), (0, "b")]);
+        assert!(subsidiary.is_empty());
+    }
+
+    #[test]
+    fn test_parse_csv_tuple_identity_invariant_under_field_reorder() {
+        // Two configs with the same fields in different declaration orders
+        // produce identical tables when fed the same source data.
+        let config_a = make_config(
+            vec![
+                make_field("id", true),
+                make_field("name", false),
+                make_field("email", false),
+            ],
+            true,
+        );
+        let config_b = make_config(
+            vec![
+                make_field("email", false),
+                make_field("id", true),
+                make_field("name", false),
+            ],
+            true,
+        );
+        let csv = "id,name,email\n1,Alice,alice@example.com\n";
+
+        let reader_a = Table::test_reader(csv, true);
+        let table_a = Table::parse_csv("t", &config_a, &FilterConfig::default(), reader_a).unwrap();
+        let reader_b = Table::test_reader(csv, true);
+        let table_b = Table::parse_csv("t", &config_b, &FilterConfig::default(), reader_b).unwrap();
+
+        assert_eq!(table_a.fields, vec!["id", "email", "name"]);
+        assert_eq!(table_a.fields, table_b.fields);
+        assert_eq!(table_a.records, table_b.records);
+    }
+
+    #[test]
+    fn test_parse_csv_no_header_uses_canonical_tuple_layout() {
+        // header=false: source columns are matched positionally against the
+        // declaration order, but the resulting tuple is still canonical.
+        let config = make_config(
+            vec![
+                make_field("name", false),
+                make_field("id", true),
+                make_field("email", false),
+            ],
+            false,
+        );
+        let reader = Table::test_reader("Alice,1,a@b.com\n", false);
+        let table = Table::parse_csv("t", &config, &FilterConfig::default(), reader).unwrap();
+
+        // Canonical layout: id (PK), then subsidiaries sorted lex.
+        assert_eq!(table.fields, vec!["id", "email", "name"]);
+        assert_eq!(
+            table.records.get(&vec!["1".into()]),
+            Some(&vec!["a@b.com".into(), "Alice".into()])
+        );
     }
 
     // -- resolve_field_indices tests --
diff --git a/tests/accept_composite_pk.rs b/tests/accept_composite_pk.rs
index 4dfda71..a758c1d 100644
--- a/tests/accept_composite_pk.rs
+++ b/tests/accept_composite_pk.rs
@@ -38,18 +38,17 @@ fields = [
     let patch = Patch::create(&config, &hash1).unwrap();
     let sql = sql::patch_to_sql(&config, &patch).unwrap().unwrap();
 
-    // DELETE should use composite key with AND
+    // Composite-key columns appear in canonical (lex-sorted) order:
+    // course_id before student_id.
     assert!(
-        sql.contains(r#"DELETE FROM "enrollments" WHERE "student_id" = 1 AND "course_id" = 102;"#)
+        sql.contains(r#"DELETE FROM "enrollments" WHERE "course_id" = 102 AND "student_id" = 1;"#)
     );
 
-    // INSERT should include all columns
     assert!(sql.contains(
-        r#"INSERT INTO "enrollments" ("student_id", "course_id", "grade") VALUES (2, 103, 'B');"#
+        r#"INSERT INTO "enrollments" ("course_id", "student_id", "grade") VALUES (103, 2, 'B');"#
     ));
 
-    // UPDATE should use composite key in WHERE
-    assert!(sql.contains(r#"WHERE "student_id" = 1 AND "course_id" = 101;"#));
+    assert!(sql.contains(r#"WHERE "course_id" = 101 AND "student_id" = 1;"#));
     assert!(sql.contains(r#"SET "grade" = 'A+'"#));
 
     common::assert_wire_roundtrip(&config, &patch);