diff --git a/analysis/query_icd10_apcs.sql b/analysis/query_icd10_apcs.sql index 3d8ec38..21aaad7 100644 --- a/analysis/query_icd10_apcs.sql +++ b/analysis/query_icd10_apcs.sql @@ -1,59 +1,138 @@ --- Count ICD10 codes from Der_Diagnosis_All in HES APC spells +-- Count ICD10 codes from HES APC spells -- Each code is counted only once per spell, regardless of how many episodes contain it -- Format of Der_Diagnosis_All: "||E119 ,E780 ,J849 ||I801 ,I802 ,N179" -- - "||" delimits episodes within a spell -- - "," delimits codes within an episode -- - We assume spaces may vary --- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14) +-- Outputs two independent count columns: +-- - primary_count: count of spells where this code is the primary diagnosis (from APCS_Der) +-- - secondary_count: count of spells where this code is in the secondary diagnoses (from APCS_Der) +-- - all_count: count of spells where this code appears in Der_Diagnosis_All +-- Note: A code may appear as primary but not in "all", or vice versa, or neither - we're not really sure +-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14, "0" if 0) -- Excludes patients with Type 1 opt-out -- -- Note: Uses XML-based string splitting for SQL Server compatibility level 100 -- (STRING_SPLIT requires compatibility level 130+ which TPP doesn't use) -WITH apcs_filtered AS ( - -- Pre-filter APCS records excluding Type 1 opt-outs - SELECT - APCS_Ident, - Der_Financial_Year, +WITH apcs_base AS ( + -- Base APCS records excluding Type 1 opt-outs + SELECT + apcs.APCS_Ident, + apcs.Der_Financial_Year, + LTRIM(RTRIM(der.Spell_Primary_Diagnosis)) as primary_diagnosis, + LTRIM(RTRIM(der.Spell_Secondary_Diagnoses)) as secondary_diagnoses, -- Normalize: replace || with comma, remove spaces - REPLACE(REPLACE(Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes - FROM APCS - WHERE Der_Diagnosis_All IS NOT NULL - AND NOT EXISTS ( - SELECT 1 FROM PatientsWithTypeOneDissent p - WHERE p.Patient_ID = APCS.Patient_ID + REPLACE(REPLACE(apcs.Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes + FROM APCS as apcs + LEFT JOIN APCS_Der AS der -- 1:1 relationship between apcs and apcs_der as per ehrql docs + ON apcs.APCS_Ident = der.APCS_Ident + WHERE NOT EXISTS ( + SELECT 1 FROM PatientsWithTypeOneDissent p + WHERE p.Patient_ID = apcs.Patient_ID ) ), -spell_codes AS ( - -- Extract all codes from each spell using XML-based string splitting - -- This approach is compatible with SQL Server 2008+ (compatibility level 100) +-- Extract all codes from Der_Diagnosis_All +all_codes AS ( SELECT DISTINCT a.APCS_Ident, a.Der_Financial_Year, LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) AS icd10_code - FROM apcs_filtered a + FROM apcs_base a CROSS APPLY ( SELECT CAST('' + REPLACE(a.normalized_codes, ',', '') + '' AS XML) AS xml_codes ) AS xml_data CROSS APPLY xml_data.xml_codes.nodes('/x') AS split(code) - WHERE LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> '' + WHERE a.normalized_codes IS NOT NULL + AND a.normalized_codes <> '' + AND LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> '' +), +-- Extract primary diagnosis codes +primary_codes AS ( + SELECT DISTINCT + APCS_Ident, + Der_Financial_Year, + primary_diagnosis AS icd10_code + FROM apcs_base + WHERE primary_diagnosis IS NOT NULL + AND primary_diagnosis <> '' +), +-- Extract secondary diagnosis codes +secondary_codes AS ( + SELECT DISTINCT + APCS_Ident, + Der_Financial_Year, + secondary_diagnoses AS icd10_code + FROM apcs_base + WHERE secondary_diagnoses IS NOT NULL + AND secondary_diagnoses <> '' +), +-- Get all unique codes from either source +all_unique_codes AS ( + SELECT Der_Financial_Year, icd10_code FROM all_codes + UNION + SELECT Der_Financial_Year, icd10_code FROM primary_codes + UNION + SELECT Der_Financial_Year, icd10_code FROM secondary_codes ), -code_counts AS ( - -- Count spells per code per financial year - SELECT +-- Count "all" occurrences per code per financial year +all_counts AS ( + SELECT Der_Financial_Year AS financial_year, icd10_code, COUNT(*) AS raw_count - FROM spell_codes + FROM all_codes GROUP BY Der_Financial_Year, icd10_code +), +-- Count "primary" occurrences per code per financial year +primary_counts AS ( + SELECT + Der_Financial_Year AS financial_year, + icd10_code, + COUNT(*) AS raw_count + FROM primary_codes + GROUP BY Der_Financial_Year, icd10_code +), +-- Count "secondary" occurrences per code per financial year +secondary_counts AS ( + SELECT + Der_Financial_Year AS financial_year, + icd10_code, + COUNT(*) AS raw_count + FROM secondary_codes + GROUP BY Der_Financial_Year, icd10_code +), +-- Combine counts +combined_counts AS ( + SELECT + u.Der_Financial_Year AS financial_year, + u.icd10_code, + ISNULL(p.raw_count, 0) AS primary_raw_count, + ISNULL(s.raw_count, 0) AS secondary_raw_count, + ISNULL(a.raw_count, 0) AS all_raw_count + FROM all_unique_codes u + LEFT JOIN primary_counts p ON u.Der_Financial_Year = p.financial_year AND u.icd10_code = p.icd10_code + LEFT JOIN secondary_counts s ON u.Der_Financial_Year = s.financial_year AND u.icd10_code = s.icd10_code + LEFT JOIN all_counts a ON u.Der_Financial_Year = a.financial_year AND u.icd10_code = a.icd10_code ) -SELECT +SELECT financial_year, icd10_code, - CASE - WHEN raw_count < 15 THEN '<15' - ELSE CAST(ROUND(raw_count, -1) AS VARCHAR(20)) - END AS spell_count -FROM code_counts -WHERE raw_count > 0 -ORDER BY financial_year DESC, raw_count DESC, icd10_code; \ No newline at end of file + CASE + WHEN primary_raw_count = 0 THEN '0' + WHEN primary_raw_count < 15 THEN '<15' + ELSE CAST(ROUND(primary_raw_count, -1) AS VARCHAR(20)) + END AS primary_count, + CASE + WHEN secondary_raw_count = 0 THEN '0' + WHEN secondary_raw_count < 15 THEN '<15' + ELSE CAST(ROUND(secondary_raw_count, -1) AS VARCHAR(20)) + END AS secondary_count, + CASE + WHEN all_raw_count = 0 THEN '0' + WHEN all_raw_count < 15 THEN '<15' + ELSE CAST(ROUND(all_raw_count, -1) AS VARCHAR(20)) + END AS all_count +FROM combined_counts +WHERE primary_raw_count > 0 OR secondary_raw_count > 0 OR all_raw_count > 0 +ORDER BY financial_year DESC, all_raw_count DESC, icd10_code diff --git a/analysis/query_icd10_ons_deaths.sql b/analysis/query_icd10_ons_deaths.sql new file mode 100644 index 0000000..50f26ad --- /dev/null +++ b/analysis/query_icd10_ons_deaths.sql @@ -0,0 +1,115 @@ +-- Count ICD10 codes from ONS Deaths data +-- Counts primary cause (icd10u) and contributing causes (ICD10001-ICD10015) separately +-- Uses only the first death record per patient (by earliest dod, then by icd10u for ties - which is what ehrql does) +-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14) +-- Excludes patients with Type 1 opt-out + +WITH first_record_per_patient AS ( + -- Get only the first death record per patient, excluding T1OO patients + SELECT * + FROM ( + SELECT + Patient_ID, + dod, + LTRIM(RTRIM(icd10u)) AS icd10_code, + LTRIM(RTRIM(ICD10001)) AS ICD10001, + LTRIM(RTRIM(ICD10002)) AS ICD10002, + LTRIM(RTRIM(ICD10003)) AS ICD10003, + LTRIM(RTRIM(ICD10004)) AS ICD10004, + LTRIM(RTRIM(ICD10005)) AS ICD10005, + LTRIM(RTRIM(ICD10006)) AS ICD10006, + LTRIM(RTRIM(ICD10007)) AS ICD10007, + LTRIM(RTRIM(ICD10008)) AS ICD10008, + LTRIM(RTRIM(ICD10009)) AS ICD10009, + LTRIM(RTRIM(ICD10010)) AS ICD10010, + LTRIM(RTRIM(ICD10011)) AS ICD10011, + LTRIM(RTRIM(ICD10012)) AS ICD10012, + LTRIM(RTRIM(ICD10013)) AS ICD10013, + LTRIM(RTRIM(ICD10014)) AS ICD10014, + LTRIM(RTRIM(ICD10015)) AS ICD10015, + ROW_NUMBER() OVER ( + PARTITION BY Patient_ID + ORDER BY dod ASC, icd10u ASC + ) AS rownum + FROM ONS_Deaths ons + WHERE NOT EXISTS ( + SELECT 1 FROM PatientsWithTypeOneDissent p WHERE p.Patient_ID = ons.Patient_ID + ) + ) t + WHERE t.rownum = 1 +), +death_codes AS ( + -- Unpivot all ICD10 codes with their type (primary vs contributing) + SELECT Patient_ID, dod, icd10_code, 'primary' AS code_type FROM first_record_per_patient WHERE icd10_code IS NOT NULL AND icd10_code <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10001, 'contributing' FROM first_record_per_patient WHERE ICD10001 IS NOT NULL AND ICD10001 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10002, 'contributing' FROM first_record_per_patient WHERE ICD10002 IS NOT NULL AND ICD10002 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10003, 'contributing' FROM first_record_per_patient WHERE ICD10003 IS NOT NULL AND ICD10003 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10004, 'contributing' FROM first_record_per_patient WHERE ICD10004 IS NOT NULL AND ICD10004 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10005, 'contributing' FROM first_record_per_patient WHERE ICD10005 IS NOT NULL AND ICD10005 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10006, 'contributing' FROM first_record_per_patient WHERE ICD10006 IS NOT NULL AND ICD10006 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10007, 'contributing' FROM first_record_per_patient WHERE ICD10007 IS NOT NULL AND ICD10007 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10008, 'contributing' FROM first_record_per_patient WHERE ICD10008 IS NOT NULL AND ICD10008 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10009, 'contributing' FROM first_record_per_patient WHERE ICD10009 IS NOT NULL AND ICD10009 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10010, 'contributing' FROM first_record_per_patient WHERE ICD10010 IS NOT NULL AND ICD10010 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10011, 'contributing' FROM first_record_per_patient WHERE ICD10011 IS NOT NULL AND ICD10011 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10012, 'contributing' FROM first_record_per_patient WHERE ICD10012 IS NOT NULL AND ICD10012 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10013, 'contributing' FROM first_record_per_patient WHERE ICD10013 IS NOT NULL AND ICD10013 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10014, 'contributing' FROM first_record_per_patient WHERE ICD10014 IS NOT NULL AND ICD10014 <> '' + UNION ALL + SELECT Patient_ID, dod, ICD10015, 'contributing' FROM first_record_per_patient WHERE ICD10015 IS NOT NULL AND ICD10015 <> '' +), +codes_with_fy AS ( + -- Calculate financial year from date of death and deduplicate per patient + SELECT DISTINCT + Patient_ID, + icd10_code, + code_type, + -- Financial year: Apr-Mar, so Apr 2024 -> 2024-25, Mar 2024 -> 2023-24 + CASE + WHEN MONTH(dod) >= 4 + THEN CAST(YEAR(dod) AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) + 1 AS VARCHAR(4)), 2) + ELSE CAST(YEAR(dod) - 1 AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) AS VARCHAR(4)), 2) + END AS financial_year + FROM death_codes + WHERE dod IS NOT NULL +), +code_counts AS ( + -- Count by financial year, code, and type + SELECT + financial_year, + icd10_code, + SUM(CASE WHEN code_type = 'primary' THEN 1 ELSE 0 END) AS primary_count, + SUM(CASE WHEN code_type = 'contributing' THEN 1 ELSE 0 END) AS contributing_count + FROM codes_with_fy + GROUP BY financial_year, icd10_code +) +SELECT + financial_year, + icd10_code, + CASE + WHEN primary_count = 0 THEN '0' + WHEN primary_count < 15 THEN '<15' + ELSE CAST(ROUND(primary_count, -1) AS VARCHAR(20)) + END AS primary_cause_count, + CASE + WHEN contributing_count = 0 THEN '0' + WHEN contributing_count < 15 THEN '<15' + ELSE CAST(ROUND(contributing_count, -1) AS VARCHAR(20)) + END AS contributing_cause_count +FROM code_counts +WHERE primary_count > 0 OR contributing_count > 0 +ORDER BY financial_year DESC, (primary_count + contributing_count) DESC, icd10_code; diff --git a/analysis/validate_output.py b/analysis/validate_output.py new file mode 100644 index 0000000..ad7e865 --- /dev/null +++ b/analysis/validate_output.py @@ -0,0 +1,84 @@ +""" +Validate output from ICD10 code counting queries. + +Checks: +1. ICD10 codes match expected formats (or are blank/NULL) +2. Financial years match expected formats (or are blank/NULL) + +Produces a short report with findings. +""" + +import csv +import re + + +# Valid ICD10 code patterns: +# - A00 (3 chars) +# - A00.0 or A00.X (5 chars with dot) +# - A000 or A00X (4 chars without dot) +# - A00.00 or A00.X0 (6 chars with dot) +# - A0000 or A00X0 (5 chars without dot) +ICD10_PATTERN = re.compile(r"^[A-Z][0-9]{2}\.?[0-9X]?[0-9]?$") + +# Valid financial year patterns: +# - "2024-25" format +# - "202425" format +FY_PATTERN = re.compile(r"^(\d{4}-\d{2}|\d{6})$") + + +def is_valid(value, pattern): + """Check if value matches pattern (blank/NULL always valid).""" + v = (value or "").strip().upper() + return v == "" or v == "NULL" or pattern.match(v) + + +def validate_file(filepath): + """Validate a CSV file, returning sets of invalid values.""" + invalid_icd10 = set() + invalid_fy = set() + + with open(filepath) as f: + for row in csv.DictReader(f): + code = row.get("icd10_code", "") + fy = row.get("financial_year", "") + + if not is_valid(code, ICD10_PATTERN): + invalid_icd10.add(code.strip()) + if not is_valid(fy, FY_PATTERN): + invalid_fy.add(fy.strip()) + + return invalid_icd10, invalid_fy + + +def format_bullet_list(items): + if not items: + return "None" + return "\n" + "\n".join(f" - {item}" for item in sorted(items)) + + +def main(): + apcs_icd10, apcs_fy = validate_file("output/icd10_apcs.csv") + ons_icd10, ons_fy = validate_file("output/icd10_ons_deaths.csv") + + lines = [ + "=" * 60, + "ICD10 CODE OUTPUT VALIDATION REPORT", + "=" * 60, + "", + "HES APCS:", + f" Invalid ICD10 codes: {format_bullet_list(apcs_icd10)}", + f" Invalid financial years: {format_bullet_list(apcs_fy)}", + "", + "ONS Deaths:", + f" Invalid ICD10 codes: {format_bullet_list(ons_icd10)}", + f" Invalid financial years: {format_bullet_list(ons_fy)}", + ] + + report = "\n".join(lines) + + with open("output/validation_report.txt", "w") as f: + f.write(report) + + +if __name__ == "__main__": + main() diff --git a/project.yaml b/project.yaml index be20fdf..9126344 100644 --- a/project.yaml +++ b/project.yaml @@ -11,3 +11,23 @@ actions: moderately_sensitive: counts: output/icd10_apcs.csv log: output/icd10_apcs_log.json + + icd10_ons_deaths: + run: > + sqlrunner:latest + --output output/icd10_ons_deaths.csv + --log-file output/icd10_ons_deaths_log.json + analysis/query_icd10_ons_deaths.sql + outputs: + moderately_sensitive: + counts: output/icd10_ons_deaths.csv + log: output/icd10_ons_deaths_log.json + + validate_output: + run: python:latest python analysis/validate_output.py + needs: + - icd10_apcs + - icd10_ons_deaths + outputs: + moderately_sensitive: + report: output/validation_report.txt