Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 109 additions & 30 deletions analysis/query_icd10_apcs.sql
Original file line number Diff line number Diff line change
@@ -1,59 +1,138 @@
-- Count ICD10 codes from Der_Diagnosis_All in HES APC spells
-- Count ICD10 codes from HES APC spells
-- Each code is counted only once per spell, regardless of how many episodes contain it
-- Format of Der_Diagnosis_All: "||E119 ,E780 ,J849 ||I801 ,I802 ,N179"
-- - "||" delimits episodes within a spell
-- - "," delimits codes within an episode
-- - We assume spaces may vary
-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
-- Outputs two independent count columns:
-- - primary_count: count of spells where this code is the primary diagnosis (from APCS_Der)
-- - secondary_count: count of spells where this code is in the secondary diagnoses (from APCS_Der)
-- - all_count: count of spells where this code appears in Der_Diagnosis_All
-- Note: A code may appear as primary but not in "all", or vice versa, or neither - we're not really sure
-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14, "0" if 0)
-- Excludes patients with Type 1 opt-out
--
-- Note: Uses XML-based string splitting for SQL Server compatibility level 100
-- (STRING_SPLIT requires compatibility level 130+ which TPP doesn't use)

WITH apcs_filtered AS (
-- Pre-filter APCS records excluding Type 1 opt-outs
SELECT
APCS_Ident,
Der_Financial_Year,
WITH apcs_base AS (
-- Base APCS records excluding Type 1 opt-outs
SELECT
apcs.APCS_Ident,
apcs.Der_Financial_Year,
LTRIM(RTRIM(der.Spell_Primary_Diagnosis)) as primary_diagnosis,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

der.Spell_Secondary_Diagnosis too for good measure?

our docs say

Code indicating secondary diagnosis. This is a single code giving the first listed secondary diagnosis, but there may other secondary diagnoses listed in the all_diagnoses field below.

LTRIM(RTRIM(der.Spell_Secondary_Diagnoses)) as secondary_diagnoses,
-- Normalize: replace || with comma, remove spaces
REPLACE(REPLACE(Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
FROM APCS
WHERE Der_Diagnosis_All IS NOT NULL
AND NOT EXISTS (
SELECT 1 FROM PatientsWithTypeOneDissent p
WHERE p.Patient_ID = APCS.Patient_ID
REPLACE(REPLACE(apcs.Der_Diagnosis_All, '||', ','), ' ', '') AS normalized_codes
FROM APCS as apcs
LEFT JOIN APCS_Der AS der -- 1:1 relationship between apcs and apcs_der as per ehrql docs
ON apcs.APCS_Ident = der.APCS_Ident
WHERE NOT EXISTS (
SELECT 1 FROM PatientsWithTypeOneDissent p
WHERE p.Patient_ID = apcs.Patient_ID
)
),
spell_codes AS (
-- Extract all codes from each spell using XML-based string splitting
-- This approach is compatible with SQL Server 2008+ (compatibility level 100)
-- Extract all codes from Der_Diagnosis_All
all_codes AS (
SELECT DISTINCT
a.APCS_Ident,
a.Der_Financial_Year,
LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) AS icd10_code
FROM apcs_filtered a
FROM apcs_base a
CROSS APPLY (
SELECT CAST('<x>' + REPLACE(a.normalized_codes, ',', '</x><x>') + '</x>' AS XML) AS xml_codes
) AS xml_data
CROSS APPLY xml_data.xml_codes.nodes('/x') AS split(code)
WHERE LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
WHERE a.normalized_codes IS NOT NULL
AND a.normalized_codes <> ''
AND LTRIM(RTRIM(split.code.value('.', 'VARCHAR(20)'))) <> ''
),
-- Extract primary diagnosis codes
primary_codes AS (
SELECT DISTINCT
APCS_Ident,
Der_Financial_Year,
primary_diagnosis AS icd10_code
FROM apcs_base
WHERE primary_diagnosis IS NOT NULL
AND primary_diagnosis <> ''
),
-- Extract secondary diagnosis codes
secondary_codes AS (
SELECT DISTINCT
APCS_Ident,
Der_Financial_Year,
secondary_diagnoses AS icd10_code
FROM apcs_base
WHERE secondary_diagnoses IS NOT NULL
AND secondary_diagnoses <> ''
),
-- Get all unique codes from either source
all_unique_codes AS (
SELECT Der_Financial_Year, icd10_code FROM all_codes
UNION
SELECT Der_Financial_Year, icd10_code FROM primary_codes
UNION
SELECT Der_Financial_Year, icd10_code FROM secondary_codes
),
code_counts AS (
-- Count spells per code per financial year
SELECT
-- Count "all" occurrences per code per financial year
all_counts AS (
SELECT
Der_Financial_Year AS financial_year,
icd10_code,
COUNT(*) AS raw_count
FROM spell_codes
FROM all_codes
GROUP BY Der_Financial_Year, icd10_code
),
-- Count "primary" occurrences per code per financial year
primary_counts AS (
SELECT
Der_Financial_Year AS financial_year,
icd10_code,
COUNT(*) AS raw_count
FROM primary_codes
GROUP BY Der_Financial_Year, icd10_code
),
-- Count "secondary" occurrences per code per financial year
secondary_counts AS (
SELECT
Der_Financial_Year AS financial_year,
icd10_code,
COUNT(*) AS raw_count
FROM secondary_codes
GROUP BY Der_Financial_Year, icd10_code
),
-- Combine counts
combined_counts AS (
SELECT
u.Der_Financial_Year AS financial_year,
u.icd10_code,
ISNULL(p.raw_count, 0) AS primary_raw_count,
ISNULL(s.raw_count, 0) AS secondary_raw_count,
ISNULL(a.raw_count, 0) AS all_raw_count
FROM all_unique_codes u
LEFT JOIN primary_counts p ON u.Der_Financial_Year = p.financial_year AND u.icd10_code = p.icd10_code
LEFT JOIN secondary_counts s ON u.Der_Financial_Year = s.financial_year AND u.icd10_code = s.icd10_code
LEFT JOIN all_counts a ON u.Der_Financial_Year = a.financial_year AND u.icd10_code = a.icd10_code
)
SELECT
SELECT
financial_year,
icd10_code,
CASE
WHEN raw_count < 15 THEN '<15'
ELSE CAST(ROUND(raw_count, -1) AS VARCHAR(20))
END AS spell_count
FROM code_counts
WHERE raw_count > 0
ORDER BY financial_year DESC, raw_count DESC, icd10_code;
CASE
WHEN primary_raw_count = 0 THEN '0'
WHEN primary_raw_count < 15 THEN '<15'
ELSE CAST(ROUND(primary_raw_count, -1) AS VARCHAR(20))
END AS primary_count,
CASE
WHEN secondary_raw_count = 0 THEN '0'
WHEN secondary_raw_count < 15 THEN '<15'
ELSE CAST(ROUND(secondary_raw_count, -1) AS VARCHAR(20))
END AS secondary_count,
CASE
WHEN all_raw_count = 0 THEN '0'
WHEN all_raw_count < 15 THEN '<15'
ELSE CAST(ROUND(all_raw_count, -1) AS VARCHAR(20))
END AS all_count
FROM combined_counts
WHERE primary_raw_count > 0 OR secondary_raw_count > 0 OR all_raw_count > 0
ORDER BY financial_year DESC, all_raw_count DESC, icd10_code
115 changes: 115 additions & 0 deletions analysis/query_icd10_ons_deaths.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
-- Count ICD10 codes from ONS Deaths data
-- Counts primary cause (icd10u) and contributing causes (ICD10001-ICD10015) separately
-- Uses only the first death record per patient (by earliest dod, then by icd10u for ties - which is what ehrql does)
-- Grouped by financial year, counts rounded to nearest 10 (or "<15" if 1-14)
-- Excludes patients with Type 1 opt-out

WITH first_record_per_patient AS (
-- Get only the first death record per patient, excluding T1OO patients
SELECT *
FROM (
SELECT
Patient_ID,
dod,
LTRIM(RTRIM(icd10u)) AS icd10_code,
LTRIM(RTRIM(ICD10001)) AS ICD10001,
LTRIM(RTRIM(ICD10002)) AS ICD10002,
LTRIM(RTRIM(ICD10003)) AS ICD10003,
LTRIM(RTRIM(ICD10004)) AS ICD10004,
LTRIM(RTRIM(ICD10005)) AS ICD10005,
LTRIM(RTRIM(ICD10006)) AS ICD10006,
LTRIM(RTRIM(ICD10007)) AS ICD10007,
LTRIM(RTRIM(ICD10008)) AS ICD10008,
LTRIM(RTRIM(ICD10009)) AS ICD10009,
LTRIM(RTRIM(ICD10010)) AS ICD10010,
LTRIM(RTRIM(ICD10011)) AS ICD10011,
LTRIM(RTRIM(ICD10012)) AS ICD10012,
LTRIM(RTRIM(ICD10013)) AS ICD10013,
LTRIM(RTRIM(ICD10014)) AS ICD10014,
LTRIM(RTRIM(ICD10015)) AS ICD10015,
ROW_NUMBER() OVER (
PARTITION BY Patient_ID
ORDER BY dod ASC, icd10u ASC
) AS rownum
FROM ONS_Deaths ons
WHERE NOT EXISTS (
SELECT 1 FROM PatientsWithTypeOneDissent p WHERE p.Patient_ID = ons.Patient_ID
)
) t
WHERE t.rownum = 1
),
death_codes AS (
-- Unpivot all ICD10 codes with their type (primary vs contributing)
SELECT Patient_ID, dod, icd10_code, 'primary' AS code_type FROM first_record_per_patient WHERE icd10_code IS NOT NULL AND icd10_code <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10001, 'contributing' FROM first_record_per_patient WHERE ICD10001 IS NOT NULL AND ICD10001 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10002, 'contributing' FROM first_record_per_patient WHERE ICD10002 IS NOT NULL AND ICD10002 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10003, 'contributing' FROM first_record_per_patient WHERE ICD10003 IS NOT NULL AND ICD10003 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10004, 'contributing' FROM first_record_per_patient WHERE ICD10004 IS NOT NULL AND ICD10004 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10005, 'contributing' FROM first_record_per_patient WHERE ICD10005 IS NOT NULL AND ICD10005 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10006, 'contributing' FROM first_record_per_patient WHERE ICD10006 IS NOT NULL AND ICD10006 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10007, 'contributing' FROM first_record_per_patient WHERE ICD10007 IS NOT NULL AND ICD10007 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10008, 'contributing' FROM first_record_per_patient WHERE ICD10008 IS NOT NULL AND ICD10008 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10009, 'contributing' FROM first_record_per_patient WHERE ICD10009 IS NOT NULL AND ICD10009 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10010, 'contributing' FROM first_record_per_patient WHERE ICD10010 IS NOT NULL AND ICD10010 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10011, 'contributing' FROM first_record_per_patient WHERE ICD10011 IS NOT NULL AND ICD10011 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10012, 'contributing' FROM first_record_per_patient WHERE ICD10012 IS NOT NULL AND ICD10012 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10013, 'contributing' FROM first_record_per_patient WHERE ICD10013 IS NOT NULL AND ICD10013 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10014, 'contributing' FROM first_record_per_patient WHERE ICD10014 IS NOT NULL AND ICD10014 <> ''
UNION ALL
SELECT Patient_ID, dod, ICD10015, 'contributing' FROM first_record_per_patient WHERE ICD10015 IS NOT NULL AND ICD10015 <> ''
),
codes_with_fy AS (
-- Calculate financial year from date of death and deduplicate per patient
SELECT DISTINCT
Patient_ID,
icd10_code,
code_type,
-- Financial year: Apr-Mar, so Apr 2024 -> 2024-25, Mar 2024 -> 2023-24
CASE
WHEN MONTH(dod) >= 4
THEN CAST(YEAR(dod) AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) + 1 AS VARCHAR(4)), 2)
ELSE CAST(YEAR(dod) - 1 AS VARCHAR(4)) + '-' + RIGHT(CAST(YEAR(dod) AS VARCHAR(4)), 2)
END AS financial_year
FROM death_codes
WHERE dod IS NOT NULL
),
code_counts AS (
-- Count by financial year, code, and type
SELECT
financial_year,
icd10_code,
SUM(CASE WHEN code_type = 'primary' THEN 1 ELSE 0 END) AS primary_count,
SUM(CASE WHEN code_type = 'contributing' THEN 1 ELSE 0 END) AS contributing_count
FROM codes_with_fy
GROUP BY financial_year, icd10_code
)
SELECT
financial_year,
icd10_code,
CASE
WHEN primary_count = 0 THEN '0'
WHEN primary_count < 15 THEN '<15'
ELSE CAST(ROUND(primary_count, -1) AS VARCHAR(20))
END AS primary_cause_count,
CASE
WHEN contributing_count = 0 THEN '0'
WHEN contributing_count < 15 THEN '<15'
ELSE CAST(ROUND(contributing_count, -1) AS VARCHAR(20))
END AS contributing_cause_count
FROM code_counts
WHERE primary_count > 0 OR contributing_count > 0
ORDER BY financial_year DESC, (primary_count + contributing_count) DESC, icd10_code;
84 changes: 84 additions & 0 deletions analysis/validate_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Validate output from ICD10 code counting queries.

Checks:
1. ICD10 codes match expected formats (or are blank/NULL)
2. Financial years match expected formats (or are blank/NULL)

Produces a short report with findings.
"""

import csv
import re


# Valid ICD10 code patterns:
# - A00 (3 chars)
# - A00.0 or A00.X (5 chars with dot)
# - A000 or A00X (4 chars without dot)
# - A00.00 or A00.X0 (6 chars with dot)
# - A0000 or A00X0 (5 chars without dot)
ICD10_PATTERN = re.compile(r"^[A-Z][0-9]{2}\.?[0-9X]?[0-9]?$")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to correctly match A00X0 and A00.X0 so I think the comment is wrong?


# Valid financial year patterns:
# - "2024-25" format
# - "202425" format
FY_PATTERN = re.compile(r"^(\d{4}-\d{2}|\d{6})$")


def is_valid(value, pattern):
"""Check if value matches pattern (blank/NULL always valid)."""
v = (value or "").strip().upper()
return v == "" or v == "NULL" or pattern.match(v)


def validate_file(filepath):
"""Validate a CSV file, returning sets of invalid values."""
invalid_icd10 = set()
invalid_fy = set()

with open(filepath) as f:
for row in csv.DictReader(f):
code = row.get("icd10_code", "")
fy = row.get("financial_year", "")

if not is_valid(code, ICD10_PATTERN):
invalid_icd10.add(code.strip())
if not is_valid(fy, FY_PATTERN):
invalid_fy.add(fy.strip())

return invalid_icd10, invalid_fy


def format_bullet_list(items):
if not items:
return "None"
return "\n" + "\n".join(f" - {item}" for item in sorted(items))


def main():
apcs_icd10, apcs_fy = validate_file("output/icd10_apcs.csv")
ons_icd10, ons_fy = validate_file("output/icd10_ons_deaths.csv")

lines = [
"=" * 60,
"ICD10 CODE OUTPUT VALIDATION REPORT",
"=" * 60,
"",
"HES APCS:",
f" Invalid ICD10 codes: {format_bullet_list(apcs_icd10)}",
f" Invalid financial years: {format_bullet_list(apcs_fy)}",
"",
"ONS Deaths:",
f" Invalid ICD10 codes: {format_bullet_list(ons_icd10)}",
f" Invalid financial years: {format_bullet_list(ons_fy)}",
]

report = "\n".join(lines)

with open("output/validation_report.txt", "w") as f:
f.write(report)


if __name__ == "__main__":
main()
20 changes: 20 additions & 0 deletions project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,23 @@ actions:
moderately_sensitive:
counts: output/icd10_apcs.csv
log: output/icd10_apcs_log.json

icd10_ons_deaths:
run: >
sqlrunner:latest
--output output/icd10_ons_deaths.csv
--log-file output/icd10_ons_deaths_log.json
analysis/query_icd10_ons_deaths.sql
outputs:
moderately_sensitive:
counts: output/icd10_ons_deaths.csv
log: output/icd10_ons_deaths_log.json

validate_output:
run: python:latest python analysis/validate_output.py
needs:
- icd10_apcs
- icd10_ons_deaths
outputs:
moderately_sensitive:
report: output/validation_report.txt