diff --git a/analysis/query_icd10_apcs.sql b/analysis/query_icd10_apcs.sql
index 3d8ec38..21aaad7 100644
--- a/analysis/query_icd10_apcs.sql
+++ b/analysis/query_icd10_apcs.sql
@@ -1,59 +1,138 @@
--- Count ICD10 codes from Der_Diagnosis_All in HES APC spells
+-- Count ICD10 codes from HES APC spells
-- Each code is counted only once per spell, regardless of how many episodes contain it
-- Format of Der_Diagnosis_All: "||E119 ,E780 ,J849 ||I801 ,I802 ,N179"
-- - "||" delimits episodes within a spell
-- - "," delimits codes within an episode
-- - We assume spaces may vary
--- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
+-- Outputs two independent count columns:
+-- - primary_count: count of spells where this code is the primary diagnosis (from APCS_Der)
+-- - secondary_count: count of spells where this code is in the secondary diagnoses (from APCS_Der)
+-- - all_count: count of spells where this code appears in Der_Diagnosis_All
+-- Note: A code may appear as primary but not in "all", or vice versa, or neither - we're not really sure
+-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14, "0" if 0)
-- Excludes patients with Type 1 opt-out
--
-- Note: Uses XML-based string splitting for SQL Server compatibility level 100
-- (STRING_SPLIT requires compatibility level 130+ which TPP doesn't use)
-WITH apcs_filtered AS (
- -- Pre-filter APCS records excluding Type 1 opt-outs
- SELECT
- APCS_Ident,
- Der_Financial_Year,
+WITH apcs_base AS (
+ -- Base APCS records excluding Type 1 opt-outs
+ SELECT
+ apcs.APCS_Ident,
+ apcs.Der_Financial_Year,
+ LTRIM(RTRIM(der.Spell_Primary_Diagnosis)) as primary_diagnosis,
+ LTRIM(RTRIM(der.Spell_Secondary_Diagnoses)) as secondary_diagnoses,
-- Normalize: replace || with comma, remove spaces
- REPLACE(REPLACE(Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
- FROM APCS
- WHERE Der_Diagnosis_All IS NOT NULL
- AND NOT EXISTS (
- SELECT 1 FROM PatientsWithTypeOneDissent p
- WHERE p.Patient_ID = APCS.Patient_ID
+ REPLACE(REPLACE(apcs.Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
+ FROM APCS as apcs
+ LEFT JOIN APCS_Der AS der -- 1:1 relationship between apcs and apcs_der as per ehrql docs
+ ON apcs.APCS_Ident = der.APCS_Ident
+ WHERE NOT EXISTS (
+ SELECT 1 FROM PatientsWithTypeOneDissent p
+ WHERE p.Patient_ID = apcs.Patient_ID
)
),
-spell_codes AS (
- -- Extract all codes from each spell using XML-based string splitting
- -- This approach is compatible with SQL Server 2008+ (compatibility level 100)
+-- Extract all codes from Der_Diagnosis_All
+all_codes AS (
SELECT DISTINCT
a.APCS_Ident,
a.Der_Financial_Year,
LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) AS icd10_code
- FROM apcs_filtered a
+ FROM apcs_base a
CROSS APPLY (
SELECT CAST('' + REPLACE(a.normalized_codes, ',', '') + '' AS XML) AS xml_codes
) AS xml_data
CROSS APPLY xml_data.xml_codes.nodes('/x') AS split(code)
- WHERE LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
+ WHERE a.normalized_codes IS NOT NULL
+ AND a.normalized_codes <> ''
+ AND LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
+),
+-- Extract primary diagnosis codes
+primary_codes AS (
+ SELECT DISTINCT
+ APCS_Ident,
+ Der_Financial_Year,
+ primary_diagnosis AS icd10_code
+ FROM apcs_base
+ WHERE primary_diagnosis IS NOT NULL
+ AND primary_diagnosis <> ''
+),
+-- Extract secondary diagnosis codes
+secondary_codes AS (
+ SELECT DISTINCT
+ APCS_Ident,
+ Der_Financial_Year,
+ secondary_diagnoses AS icd10_code
+ FROM apcs_base
+ WHERE secondary_diagnoses IS NOT NULL
+ AND secondary_diagnoses <> ''
+),
+-- Get all unique codes from either source
+all_unique_codes AS (
+ SELECT Der_Financial_Year, icd10_code FROM all_codes
+ UNION
+ SELECT Der_Financial_Year, icd10_code FROM primary_codes
+ UNION
+ SELECT Der_Financial_Year, icd10_code FROM secondary_codes
),
-code_counts AS (
- -- Count spells per code per financial year
- SELECT
+-- Count "all" occurrences per code per financial year
+all_counts AS (
+ SELECT
Der_Financial_Year AS financial_year,
icd10_code,
COUNT(*) AS raw_count
- FROM spell_codes
+ FROM all_codes
GROUP BY Der_Financial_Year, icd10_code
+),
+-- Count "primary" occurrences per code per financial year
+primary_counts AS (
+ SELECT
+ Der_Financial_Year AS financial_year,
+ icd10_code,
+ COUNT(*) AS raw_count
+ FROM primary_codes
+ GROUP BY Der_Financial_Year, icd10_code
+),
+-- Count "secondary" occurrences per code per financial year
+secondary_counts AS (
+ SELECT
+ Der_Financial_Year AS financial_year,
+ icd10_code,
+ COUNT(*) AS raw_count
+ FROM secondary_codes
+ GROUP BY Der_Financial_Year, icd10_code
+),
+-- Combine counts
+combined_counts AS (
+ SELECT
+ u.Der_Financial_Year AS financial_year,
+ u.icd10_code,
+ ISNULL(p.raw_count, 0) AS primary_raw_count,
+ ISNULL(s.raw_count, 0) AS secondary_raw_count,
+ ISNULL(a.raw_count, 0) AS all_raw_count
+ FROM all_unique_codes u
+ LEFT JOIN primary_counts p ON u.Der_Financial_Year = p.financial_year AND u.icd10_code = p.icd10_code
+ LEFT JOIN secondary_counts s ON u.Der_Financial_Year = s.financial_year AND u.icd10_code = s.icd10_code
+ LEFT JOIN all_counts a ON u.Der_Financial_Year = a.financial_year AND u.icd10_code = a.icd10_code
)
-SELECT
+SELECT
financial_year,
icd10_code,
- CASE
- WHEN raw_count < 15 THEN '<15'
- ELSE CAST(ROUND(raw_count, -1) AS VARCHAR(20))
- END AS spell_count
-FROM code_counts
-WHERE raw_count > 0
-ORDER BY financial_year DESC, raw_count DESC, icd10_code;
\ No newline at end of file
+ CASE
+ WHEN primary_raw_count = 0 THEN '0'
+ WHEN primary_raw_count < 15 THEN '<15'
+ ELSE CAST(ROUND(primary_raw_count, -1) AS VARCHAR(20))
+ END AS primary_count,
+ CASE
+ WHEN secondary_raw_count = 0 THEN '0'
+ WHEN secondary_raw_count < 15 THEN '<15'
+ ELSE CAST(ROUND(secondary_raw_count, -1) AS VARCHAR(20))
+ END AS secondary_count,
+ CASE
+ WHEN all_raw_count = 0 THEN '0'
+ WHEN all_raw_count < 15 THEN '<15'
+ ELSE CAST(ROUND(all_raw_count, -1) AS VARCHAR(20))
+ END AS all_count
+FROM combined_counts
+WHERE primary_raw_count > 0 OR secondary_raw_count > 0 OR all_raw_count > 0
+ORDER BY financial_year DESC, all_raw_count DESC, icd10_code
diff --git a/analysis/query_icd10_ons_deaths.sql b/analysis/query_icd10_ons_deaths.sql
new file mode 100644
index 0000000..50f26ad
--- /dev/null
+++ b/analysis/query_icd10_ons_deaths.sql
@@ -0,0 +1,115 @@
+-- Count ICD10 codes from ONS Deaths data
+-- Counts primary cause (icd10u) and contributing causes (ICD10001-ICD10015) separately
+-- Uses only the first death record per patient (by earliest dod, then by icd10u for ties - which is what ehrql does)
+-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
+-- Excludes patients with Type 1 opt-out
+
+WITH first_record_per_patient AS (
+ -- Get only the first death record per patient, excluding T1OO patients
+ SELECT *
+ FROM (
+ SELECT
+ Patient_ID,
+ dod,
+ LTRIM(RTRIM(icd10u)) AS icd10_code,
+ LTRIM(RTRIM(ICD10001)) AS ICD10001,
+ LTRIM(RTRIM(ICD10002)) AS ICD10002,
+ LTRIM(RTRIM(ICD10003)) AS ICD10003,
+ LTRIM(RTRIM(ICD10004)) AS ICD10004,
+ LTRIM(RTRIM(ICD10005)) AS ICD10005,
+ LTRIM(RTRIM(ICD10006)) AS ICD10006,
+ LTRIM(RTRIM(ICD10007)) AS ICD10007,
+ LTRIM(RTRIM(ICD10008)) AS ICD10008,
+ LTRIM(RTRIM(ICD10009)) AS ICD10009,
+ LTRIM(RTRIM(ICD10010)) AS ICD10010,
+ LTRIM(RTRIM(ICD10011)) AS ICD10011,
+ LTRIM(RTRIM(ICD10012)) AS ICD10012,
+ LTRIM(RTRIM(ICD10013)) AS ICD10013,
+ LTRIM(RTRIM(ICD10014)) AS ICD10014,
+ LTRIM(RTRIM(ICD10015)) AS ICD10015,
+ ROW_NUMBER() OVER (
+ PARTITION BY Patient_ID
+ ORDER BY dod ASC, icd10u ASC
+ ) AS rownum
+ FROM ONS_Deaths ons
+ WHERE NOT EXISTS (
+ SELECT 1 FROM PatientsWithTypeOneDissent p WHERE p.Patient_ID = ons.Patient_ID
+ )
+ ) t
+ WHERE t.rownum = 1
+),
+death_codes AS (
+ -- Unpivot all ICD10 codes with their type (primary vs contributing)
+ SELECT Patient_ID, dod, icd10_code, 'primary' AS code_type FROM first_record_per_patient WHERE icd10_code IS NOT NULL AND icd10_code <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10001, 'contributing' FROM first_record_per_patient WHERE ICD10001 IS NOT NULL AND ICD10001 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10002, 'contributing' FROM first_record_per_patient WHERE ICD10002 IS NOT NULL AND ICD10002 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10003, 'contributing' FROM first_record_per_patient WHERE ICD10003 IS NOT NULL AND ICD10003 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10004, 'contributing' FROM first_record_per_patient WHERE ICD10004 IS NOT NULL AND ICD10004 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10005, 'contributing' FROM first_record_per_patient WHERE ICD10005 IS NOT NULL AND ICD10005 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10006, 'contributing' FROM first_record_per_patient WHERE ICD10006 IS NOT NULL AND ICD10006 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10007, 'contributing' FROM first_record_per_patient WHERE ICD10007 IS NOT NULL AND ICD10007 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10008, 'contributing' FROM first_record_per_patient WHERE ICD10008 IS NOT NULL AND ICD10008 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10009, 'contributing' FROM first_record_per_patient WHERE ICD10009 IS NOT NULL AND ICD10009 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10010, 'contributing' FROM first_record_per_patient WHERE ICD10010 IS NOT NULL AND ICD10010 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10011, 'contributing' FROM first_record_per_patient WHERE ICD10011 IS NOT NULL AND ICD10011 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10012, 'contributing' FROM first_record_per_patient WHERE ICD10012 IS NOT NULL AND ICD10012 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10013, 'contributing' FROM first_record_per_patient WHERE ICD10013 IS NOT NULL AND ICD10013 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10014, 'contributing' FROM first_record_per_patient WHERE ICD10014 IS NOT NULL AND ICD10014 <> ''
+ UNION ALL
+ SELECT Patient_ID, dod, ICD10015, 'contributing' FROM first_record_per_patient WHERE ICD10015 IS NOT NULL AND ICD10015 <> ''
+),
+codes_with_fy AS (
+ -- Calculate financial year from date of death and deduplicate per patient
+ SELECT DISTINCT
+ Patient_ID,
+ icd10_code,
+ code_type,
+ -- Financial year: Apr-Mar, so Apr 2024 -> 2024-25, Mar 2024 -> 2023-24
+ CASE
+ WHEN MONTH(dod) >= 4
+ THEN CAST(YEAR(dod) AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) + 1 AS VARCHAR(4)), 2)
+ ELSE CAST(YEAR(dod) - 1 AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) AS VARCHAR(4)), 2)
+ END AS financial_year
+ FROM death_codes
+ WHERE dod IS NOT NULL
+),
+code_counts AS (
+ -- Count by financial year, code, and type
+ SELECT
+ financial_year,
+ icd10_code,
+ SUM(CASE WHEN code_type = 'primary' THEN 1 ELSE 0 END) AS primary_count,
+ SUM(CASE WHEN code_type = 'contributing' THEN 1 ELSE 0 END) AS contributing_count
+ FROM codes_with_fy
+ GROUP BY financial_year, icd10_code
+)
+SELECT
+ financial_year,
+ icd10_code,
+ CASE
+ WHEN primary_count = 0 THEN '0'
+ WHEN primary_count < 15 THEN '<15'
+ ELSE CAST(ROUND(primary_count, -1) AS VARCHAR(20))
+ END AS primary_cause_count,
+ CASE
+ WHEN contributing_count = 0 THEN '0'
+ WHEN contributing_count < 15 THEN '<15'
+ ELSE CAST(ROUND(contributing_count, -1) AS VARCHAR(20))
+ END AS contributing_cause_count
+FROM code_counts
+WHERE primary_count > 0 OR contributing_count > 0
+ORDER BY financial_year DESC, (primary_count + contributing_count) DESC, icd10_code;
diff --git a/analysis/validate_output.py b/analysis/validate_output.py
new file mode 100644
index 0000000..ad7e865
--- /dev/null
+++ b/analysis/validate_output.py
@@ -0,0 +1,84 @@
+"""
+Validate output from ICD10 code counting queries.
+
+Checks:
+1. ICD10 codes match expected formats (or are blank/NULL)
+2. Financial years match expected formats (or are blank/NULL)
+
+Produces a short report with findings.
+"""
+
+import csv
+import re
+
+
+# Valid ICD10 code patterns:
+# - A00 (3 chars)
+# - A00.0 or A00.X (5 chars with dot)
+# - A000 or A00X (4 chars without dot)
+# - A00.00 or A00.X0 (6 chars with dot)
+# - A0000 or A00X0 (5 chars without dot)
+ICD10_PATTERN = re.compile(r"^[A-Z][0-9]{2}\.?[0-9X]?[0-9]?$")
+
+# Valid financial year patterns:
+# - "2024-25" format
+# - "202425" format
+FY_PATTERN = re.compile(r"^(\d{4}-\d{2}|\d{6})$")
+
+
+def is_valid(value, pattern):
+ """Check if value matches pattern (blank/NULL always valid)."""
+ v = (value or "").strip().upper()
+ return v == "" or v == "NULL" or pattern.match(v)
+
+
+def validate_file(filepath):
+ """Validate a CSV file, returning sets of invalid values."""
+ invalid_icd10 = set()
+ invalid_fy = set()
+
+ with open(filepath) as f:
+ for row in csv.DictReader(f):
+ code = row.get("icd10_code", "")
+ fy = row.get("financial_year", "")
+
+ if not is_valid(code, ICD10_PATTERN):
+ invalid_icd10.add(code.strip())
+ if not is_valid(fy, FY_PATTERN):
+ invalid_fy.add(fy.strip())
+
+ return invalid_icd10, invalid_fy
+
+
+def format_bullet_list(items):
+ if not items:
+ return "None"
+ return "\n" + "\n".join(f" - {item}" for item in sorted(items))
+
+
+def main():
+ apcs_icd10, apcs_fy = validate_file("output/icd10_apcs.csv")
+ ons_icd10, ons_fy = validate_file("output/icd10_ons_deaths.csv")
+
+ lines = [
+ "=" * 60,
+ "ICD10 CODE OUTPUT VALIDATION REPORT",
+ "=" * 60,
+ "",
+ "HES APCS:",
+ f" Invalid ICD10 codes: {format_bullet_list(apcs_icd10)}",
+ f" Invalid financial years: {format_bullet_list(apcs_fy)}",
+ "",
+ "ONS Deaths:",
+ f" Invalid ICD10 codes: {format_bullet_list(ons_icd10)}",
+ f" Invalid financial years: {format_bullet_list(ons_fy)}",
+ ]
+
+ report = "\n".join(lines)
+
+ with open("output/validation_report.txt", "w") as f:
+ f.write(report)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/project.yaml b/project.yaml
index be20fdf..9126344 100644
--- a/project.yaml
+++ b/project.yaml
@@ -11,3 +11,23 @@ actions:
moderately_sensitive:
counts: output/icd10_apcs.csv
log: output/icd10_apcs_log.json
+
+ icd10_ons_deaths:
+ run: >
+ sqlrunner:latest
+ --output output/icd10_ons_deaths.csv
+ --log-file output/icd10_ons_deaths_log.json
+ analysis/query_icd10_ons_deaths.sql
+ outputs:
+ moderately_sensitive:
+ counts: output/icd10_ons_deaths.csv
+ log: output/icd10_ons_deaths_log.json
+
+ validate_output:
+ run: python:latest python analysis/validate_output.py
+ needs:
+ - icd10_apcs
+ - icd10_ons_deaths
+ outputs:
+ moderately_sensitive:
+ report: output/validation_report.txt