opensafely · rw251 · Dec 2, 2025 · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025
diff --git a/analysis/query_icd10_apcs.sql b/analysis/query_icd10_apcs.sql
@@ -1,59 +1,138 @@
--- Count ICD10 codes from Der_Diagnosis_All in HES APC spells
+-- Count ICD10 codes from HES APC spells
 -- Each code is counted only once per spell, regardless of how many episodes contain it
 -- Format of Der_Diagnosis_All: "||E119 ,E780 ,J849 ||I801 ,I802 ,N179"
 --   - "||" delimits episodes within a spell
 --   - "," delimits codes within an episode
 --   - We assume spaces may vary
--- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
+-- Outputs two independent count columns:
+--   - primary_count: count of spells where this code is the primary diagnosis (from APCS_Der)
+--   - secondary_count: count of spells where this code is in the secondary diagnoses (from APCS_Der)
+--   - all_count: count of spells where this code appears in Der_Diagnosis_All
+-- Note: A code may appear as primary but not in "all", or vice versa, or neither - we're not really sure
+-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14, "0" if 0)
 -- Excludes patients with Type 1 opt-out
 --
 -- Note: Uses XML-based string splitting for SQL Server compatibility level 100
 -- (STRING_SPLIT requires compatibility level 130+ which TPP doesn't use)
 
-WITH apcs_filtered AS (
-    -- Pre-filter APCS records excluding Type 1 opt-outs
-    SELECT 
-        APCS_Ident,
-        Der_Financial_Year,
+WITH apcs_base AS (
+    -- Base APCS records excluding Type 1 opt-outs
+    SELECT
+        apcs.APCS_Ident,
+        apcs.Der_Financial_Year,
+        LTRIM(RTRIM(der.Spell_Primary_Diagnosis)) as primary_diagnosis,
+        LTRIM(RTRIM(der.Spell_Secondary_Diagnoses)) as secondary_diagnoses,
         -- Normalize: replace || with comma, remove spaces
-        REPLACE(REPLACE(Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
-    FROM APCS
-    WHERE Der_Diagnosis_All IS NOT NULL
-      AND NOT EXISTS (
-          SELECT 1 FROM PatientsWithTypeOneDissent p 
-          WHERE p.Patient_ID = APCS.Patient_ID
+        REPLACE(REPLACE(apcs.Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
+    FROM APCS as apcs
+    LEFT JOIN APCS_Der AS der -- 1:1 relationship between apcs and apcs_der as per ehrql docs
+    ON apcs.APCS_Ident = der.APCS_Ident
+    WHERE NOT EXISTS (
+          SELECT 1 FROM PatientsWithTypeOneDissent p
+          WHERE p.Patient_ID = apcs.Patient_ID
       )
 ),
-spell_codes AS (
-    -- Extract all codes from each spell using XML-based string splitting
-    -- This approach is compatible with SQL Server 2008+ (compatibility level 100)
+-- Extract all codes from Der_Diagnosis_All
+all_codes AS (
     SELECT DISTINCT
         a.APCS_Ident,
         a.Der_Financial_Year,
         LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) AS icd10_code
-    FROM apcs_filtered a
+    FROM apcs_base a
     CROSS APPLY (
         SELECT CAST('<x>' + REPLACE(a.normalized_codes, ',', '</x><x>') + '</x>' AS XML) AS xml_codes
     ) AS xml_data
     CROSS APPLY xml_data.xml_codes.nodes('/x') AS split(code)
-    WHERE LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
+    WHERE a.normalized_codes IS NOT NULL
+      AND a.normalized_codes <> ''
+      AND LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
+),
+-- Extract primary diagnosis codes
+primary_codes AS (
+    SELECT DISTINCT
+        APCS_Ident,
+        Der_Financial_Year,
+        primary_diagnosis AS icd10_code
+    FROM apcs_base
+    WHERE primary_diagnosis IS NOT NULL
+      AND primary_diagnosis <> ''
+),
+-- Extract secondary diagnosis codes
+secondary_codes AS (
+    SELECT DISTINCT
+        APCS_Ident,
+        Der_Financial_Year,
+        secondary_diagnoses AS icd10_code
+    FROM apcs_base
+    WHERE secondary_diagnoses IS NOT NULL
+      AND secondary_diagnoses <> ''
+),
+-- Get all unique codes from either source
+all_unique_codes AS (
+    SELECT Der_Financial_Year, icd10_code FROM all_codes
+    UNION
+    SELECT Der_Financial_Year, icd10_code FROM primary_codes
+    UNION
+    SELECT Der_Financial_Year, icd10_code FROM secondary_codes
 ),
-code_counts AS (
-    -- Count spells per code per financial year
-    SELECT 
+-- Count "all" occurrences per code per financial year
+all_counts AS (
+    SELECT
         Der_Financial_Year AS financial_year,
         icd10_code,
         COUNT(*) AS raw_count
-    FROM spell_codes
+    FROM all_codes
     GROUP BY Der_Financial_Year, icd10_code
+),
+-- Count "primary" occurrences per code per financial year
+primary_counts AS (
+    SELECT
+        Der_Financial_Year AS financial_year,
+        icd10_code,
+        COUNT(*) AS raw_count
+    FROM primary_codes
+    GROUP BY Der_Financial_Year, icd10_code
+),
+-- Count "secondary" occurrences per code per financial year
+secondary_counts AS (
+    SELECT
+        Der_Financial_Year AS financial_year,
+        icd10_code,
+        COUNT(*) AS raw_count
+    FROM secondary_codes
+    GROUP BY Der_Financial_Year, icd10_code
+),
+-- Combine counts
+combined_counts AS (
+    SELECT
+        u.Der_Financial_Year AS financial_year,
+        u.icd10_code,
+        ISNULL(p.raw_count, 0) AS primary_raw_count,
+        ISNULL(s.raw_count, 0) AS secondary_raw_count,
+        ISNULL(a.raw_count, 0) AS all_raw_count
+    FROM all_unique_codes u
+    LEFT JOIN primary_counts p ON u.Der_Financial_Year = p.financial_year AND u.icd10_code = p.icd10_code
+    LEFT JOIN secondary_counts s ON u.Der_Financial_Year = s.financial_year AND u.icd10_code = s.icd10_code
+    LEFT JOIN all_counts a ON u.Der_Financial_Year = a.financial_year AND u.icd10_code = a.icd10_code
 )
-SELECT 
+SELECT
     financial_year,
     icd10_code,
-    CASE 
-        WHEN raw_count < 15 THEN '<15'
-        ELSE CAST(ROUND(raw_count, -1) AS VARCHAR(20))
-    END AS spell_count
-FROM code_counts
-WHERE raw_count > 0
-ORDER BY financial_year DESC, raw_count DESC, icd10_code;
+    CASE
+        WHEN primary_raw_count = 0 THEN '0'
+        WHEN primary_raw_count < 15 THEN '<15'
+        ELSE CAST(ROUND(primary_raw_count, -1) AS VARCHAR(20))
+    END AS primary_count,
+    CASE
+        WHEN secondary_raw_count = 0 THEN '0'
+        WHEN secondary_raw_count < 15 THEN '<15'
+        ELSE CAST(ROUND(secondary_raw_count, -1) AS VARCHAR(20))
+    END AS secondary_count,
+    CASE
+        WHEN all_raw_count = 0 THEN '0'
+        WHEN all_raw_count < 15 THEN '<15'
+        ELSE CAST(ROUND(all_raw_count, -1) AS VARCHAR(20))
+    END AS all_count
+FROM combined_counts
+WHERE primary_raw_count > 0 OR secondary_raw_count > 0 OR all_raw_count > 0
+ORDER BY financial_year DESC, all_raw_count DESC, icd10_code
diff --git a/analysis/query_icd10_ons_deaths.sql b/analysis/query_icd10_ons_deaths.sql
@@ -0,0 +1,115 @@
+-- Count ICD10 codes from ONS Deaths data
+-- Counts primary cause (icd10u) and contributing causes (ICD10001-ICD10015) separately
+-- Uses only the first death record per patient (by earliest dod, then by icd10u for ties - which is what ehrql does)
+-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
+-- Excludes patients with Type 1 opt-out
+
+WITH first_record_per_patient AS (
+    -- Get only the first death record per patient, excluding T1OO patients
+    SELECT *
+    FROM (
+        SELECT
+            Patient_ID,
+            dod,
+            LTRIM(RTRIM(icd10u)) AS icd10_code,
+            LTRIM(RTRIM(ICD10001)) AS ICD10001,
+            LTRIM(RTRIM(ICD10002)) AS ICD10002,
+            LTRIM(RTRIM(ICD10003)) AS ICD10003,
+            LTRIM(RTRIM(ICD10004)) AS ICD10004,
+            LTRIM(RTRIM(ICD10005)) AS ICD10005,
+            LTRIM(RTRIM(ICD10006)) AS ICD10006,
+            LTRIM(RTRIM(ICD10007)) AS ICD10007,
+            LTRIM(RTRIM(ICD10008)) AS ICD10008,
+            LTRIM(RTRIM(ICD10009)) AS ICD10009,
+            LTRIM(RTRIM(ICD10010)) AS ICD10010,
+            LTRIM(RTRIM(ICD10011)) AS ICD10011,
+            LTRIM(RTRIM(ICD10012)) AS ICD10012,
+            LTRIM(RTRIM(ICD10013)) AS ICD10013,
+            LTRIM(RTRIM(ICD10014)) AS ICD10014,
+            LTRIM(RTRIM(ICD10015)) AS ICD10015,
+            ROW_NUMBER() OVER (
+                PARTITION BY Patient_ID
+                ORDER BY dod ASC, icd10u ASC
+            ) AS rownum
+        FROM ONS_Deaths ons
+        WHERE NOT EXISTS (
+            SELECT 1 FROM PatientsWithTypeOneDissent p WHERE p.Patient_ID = ons.Patient_ID
+        )
+    ) t
+    WHERE t.rownum = 1
+),
+death_codes AS (
+    -- Unpivot all ICD10 codes with their type (primary vs contributing)
+    SELECT Patient_ID, dod, icd10_code, 'primary' AS code_type FROM first_record_per_patient WHERE icd10_code IS NOT NULL AND icd10_code <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10001, 'contributing' FROM first_record_per_patient WHERE ICD10001 IS NOT NULL AND ICD10001 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10002, 'contributing' FROM first_record_per_patient WHERE ICD10002 IS NOT NULL AND ICD10002 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10003, 'contributing' FROM first_record_per_patient WHERE ICD10003 IS NOT NULL AND ICD10003 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10004, 'contributing' FROM first_record_per_patient WHERE ICD10004 IS NOT NULL AND ICD10004 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10005, 'contributing' FROM first_record_per_patient WHERE ICD10005 IS NOT NULL AND ICD10005 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10006, 'contributing' FROM first_record_per_patient WHERE ICD10006 IS NOT NULL AND ICD10006 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10007, 'contributing' FROM first_record_per_patient WHERE ICD10007 IS NOT NULL AND ICD10007 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10008, 'contributing' FROM first_record_per_patient WHERE ICD10008 IS NOT NULL AND ICD10008 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10009, 'contributing' FROM first_record_per_patient WHERE ICD10009 IS NOT NULL AND ICD10009 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10010, 'contributing' FROM first_record_per_patient WHERE ICD10010 IS NOT NULL AND ICD10010 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10011, 'contributing' FROM first_record_per_patient WHERE ICD10011 IS NOT NULL AND ICD10011 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10012, 'contributing' FROM first_record_per_patient WHERE ICD10012 IS NOT NULL AND ICD10012 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10013, 'contributing' FROM first_record_per_patient WHERE ICD10013 IS NOT NULL AND ICD10013 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10014, 'contributing' FROM first_record_per_patient WHERE ICD10014 IS NOT NULL AND ICD10014 <> ''
+    UNION ALL
+    SELECT Patient_ID, dod, ICD10015, 'contributing' FROM first_record_per_patient WHERE ICD10015 IS NOT NULL AND ICD10015 <> ''
+),
+codes_with_fy AS (
+    -- Calculate financial year from date of death and deduplicate per patient
+    SELECT DISTINCT
+        Patient_ID,
+        icd10_code,
+        code_type,
+        -- Financial year: Apr-Mar, so Apr 2024 -> 2024-25, Mar 2024 -> 2023-24
+        CASE 
+            WHEN MONTH(dod) >= 4 
+            THEN CAST(YEAR(dod) AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) + 1 AS VARCHAR(4)), 2)
+            ELSE CAST(YEAR(dod) - 1 AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) AS VARCHAR(4)), 2)
+        END AS financial_year
+    FROM death_codes
+    WHERE dod IS NOT NULL
+),
+code_counts AS (
+    -- Count by financial year, code, and type
+    SELECT 
+        financial_year,
+        icd10_code,
+        SUM(CASE WHEN code_type = 'primary' THEN 1 ELSE 0 END) AS primary_count,
+        SUM(CASE WHEN code_type = 'contributing' THEN 1 ELSE 0 END) AS contributing_count
+    FROM codes_with_fy
+    GROUP BY financial_year, icd10_code
+)
+SELECT 
+    financial_year,
+    icd10_code,
+    CASE 
+        WHEN primary_count = 0 THEN '0'
+        WHEN primary_count < 15 THEN '<15'
+        ELSE CAST(ROUND(primary_count, -1) AS VARCHAR(20))
+    END AS primary_cause_count,
+    CASE 
+        WHEN contributing_count = 0 THEN '0'
+        WHEN contributing_count < 15 THEN '<15'
+        ELSE CAST(ROUND(contributing_count, -1) AS VARCHAR(20))
+    END AS contributing_cause_count
+FROM code_counts
+WHERE primary_count > 0 OR contributing_count > 0
+ORDER BY financial_year DESC, (primary_count + contributing_count) DESC, icd10_code;
diff --git a/analysis/validate_output.py b/analysis/validate_output.py
@@ -0,0 +1,84 @@
+"""
+Validate output from ICD10 code counting queries.
+
+Checks:
+1. ICD10 codes match expected formats (or are blank/NULL)
+2. Financial years match expected formats (or are blank/NULL)
+
+Produces a short report with findings.
+"""
+
+import csv
+import re
+
+
+# Valid ICD10 code patterns:
+# - A00 (3 chars)
+# - A00.0 or A00.X (5 chars with dot)
+# - A000 or A00X (4 chars without dot)
+# - A00.00 or A00.X0 (6 chars with dot)
+# - A0000 or A00X0 (5 chars without dot)
+ICD10_PATTERN = re.compile(r"^[A-Z][0-9]{2}\.?[0-9X]?[0-9]?$")
+
+# Valid financial year patterns:
+# - "2024-25" format
+# - "202425" format
+FY_PATTERN = re.compile(r"^(\d{4}-\d{2}|\d{6})$")
+
+
+def is_valid(value, pattern):
+    """Check if value matches pattern (blank/NULL always valid)."""
+    v = (value or "").strip().upper()
+    return v == "" or v == "NULL" or pattern.match(v)
+
+
+def validate_file(filepath):
+    """Validate a CSV file, returning sets of invalid values."""
+    invalid_icd10 = set()
+    invalid_fy = set()
+
+    with open(filepath) as f:
+        for row in csv.DictReader(f):
+            code = row.get("icd10_code", "")
+            fy = row.get("financial_year", "")
+
+            if not is_valid(code, ICD10_PATTERN):
+                invalid_icd10.add(code.strip())
+            if not is_valid(fy, FY_PATTERN):
+                invalid_fy.add(fy.strip())
+
+    return invalid_icd10, invalid_fy
+
+
+def format_bullet_list(items):
+    if not items:
+        return "None"
+    return "\n" + "\n".join(f"    - {item}" for item in sorted(items))
+
+
+def main():
+    apcs_icd10, apcs_fy = validate_file("output/icd10_apcs.csv")
+    ons_icd10, ons_fy = validate_file("output/icd10_ons_deaths.csv")
+
+    lines = [
+        "=" * 60,
+        "ICD10 CODE OUTPUT VALIDATION REPORT",
+        "=" * 60,
+        "",
+        "HES APCS:",
+        f"  Invalid ICD10 codes: {format_bullet_list(apcs_icd10)}",
+        f"  Invalid financial years: {format_bullet_list(apcs_fy)}",
+        "",
+        "ONS Deaths:",
+        f"  Invalid ICD10 codes: {format_bullet_list(ons_icd10)}",
+        f"  Invalid financial years: {format_bullet_list(ons_fy)}",
+    ]
+
+    report = "\n".join(lines)
+
+    with open("output/validation_report.txt", "w") as f:
+        f.write(report)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/project.yaml b/project.yaml
@@ -11,3 +11,23 @@ actions:
       moderately_sensitive:
         counts: output/icd10_apcs.csv
         log: output/icd10_apcs_log.json
+
+  icd10_ons_deaths:
+    run: >
+      sqlrunner:latest
+        --output output/icd10_ons_deaths.csv
+        --log-file output/icd10_ons_deaths_log.json
+        analysis/query_icd10_ons_deaths.sql
+    outputs:
+      moderately_sensitive:
+        counts: output/icd10_ons_deaths.csv
+        log: output/icd10_ons_deaths_log.json
+
+  validate_output:
+    run: python:latest python analysis/validate_output.py
+    needs:
+      - icd10_apcs
+      - icd10_ons_deaths
+    outputs:
+      moderately_sensitive:
+        report: output/validation_report.txt