PICORI2OMOP/simple_etl_test.py at master · TheDecodeLab/PICORI2OMOP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/env python3
"""
Simple ETL test for PICORI to OMOP conversion
This script tests the ETL process with a small sample of data
"""

import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

def create_spark_session():
    """Build (or reuse) the SparkSession used by this test script.

    The session is named "PICORI2OMOP-SimpleTest", runs on the ``local[2]``
    master, and is configured with a UTC SQL session time zone, the driver
    memory / result-size settings listed below, and adaptive query
    execution with partition coalescing enabled.

    Returns:
        The session object produced by ``SparkSession.builder.getOrCreate()``.
    """
    # Applied in this order, exactly as key/value string pairs.
    settings = (
        ("spark.sql.session.timeZone", "UTC"),
        ("spark.driver.memory", "2g"),
        ("spark.driver.maxResultSize", "1g"),
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    )

    builder = SparkSession.builder.appName("PICORI2OMOP-SimpleTest")
    builder = builder.master("local[2]")
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()

# PCORnet CDM DEMOGRAPHIC.RACE codes -> OMOP race concept ids.
_PCORNET_RACE_TO_OMOP = {
    "01": 8657,  # American Indian or Alaska Native
    "02": 8515,  # Asian
    "03": 8516,  # Black or African American
    "04": 8557,  # Native Hawaiian or Other Pacific Islander
    "05": 8527,  # White
    # NOTE(review): "06" is "Multiple race" in PCORnet; 8522 is carried over
    # unchanged from the previous mapping -- confirm the intended target.
    "06": 8522,
}


def test_demographic_etl(spark):
    """Test demographic to person ETL.

    Reads the demographic parquet file, prints a sample of the raw rows,
    derives OMOP-style person columns (sequential ``person_id``, gender /
    race / ethnicity concept ids, and year / month / day of birth), prints a
    sample of the result and a per-concept row count for each mapping.

    Args:
        spark: Active SparkSession used to read the parquet file.

    Returns:
        The transformed person DataFrame, or ``None`` when the input file
        does not exist on the local filesystem.
    """
    print("=== Testing Demographic to Person ETL ===")

    # Read demographic data
    demographic_path = "/home/asadr/datasets/stroke_data/demographic.parquet"
    if not os.path.exists(demographic_path):
        print(f"❌ File not found: {demographic_path}")
        return None

    df = spark.read.parquet(demographic_path)
    print(f"📊 Loaded {df.count()} demographic records")

    # Show sample data
    print("📄 Sample demographic data:")
    df.select("PATID", "BIRTH_DATE", "SEX", "RACE", "HISPANIC").show(5)

    # Build the RACE -> race_concept_id CASE expression from the lookup table
    # so the code/concept pairs live in one place; unmapped codes fall to 0.
    race_concept_id = None
    for race_code, concept_id in _PCORNET_RACE_TO_OMOP.items():
        is_code = col("RACE") == race_code
        if race_concept_id is None:
            race_concept_id = when(is_code, lit(concept_id))
        else:
            race_concept_id = race_concept_id.when(is_code, lit(concept_id))
    race_concept_id = race_concept_id.otherwise(lit(0))

    # Test transformations
    person_df = df.select(
        col("PATID").alias("person_source_value"),
        col("SEX"),
        col("RACE"),
        col("HISPANIC"),
        col("BIRTH_DATE")
    ).withColumn(
        # No partitionBy: ids are assigned over one global ordering of
        # person_source_value.
        "person_id",
        row_number().over(Window.orderBy("person_source_value"))
    ).withColumn(
        "gender_concept_id",
        when(col("SEX") == "M", lit(8507))
        .when(col("SEX") == "F", lit(8532))
        .otherwise(lit(0))
    ).withColumn(
        "race_concept_id",
        race_concept_id
    ).withColumn(
        "ethnicity_concept_id",
        when(col("HISPANIC") == "Y", lit(38003563))
        .when(col("HISPANIC") == "N", lit(38003564))
        .otherwise(lit(0))
    ).withColumn(
        "year_of_birth",
        year(col("BIRTH_DATE"))
    ).withColumn(
        "month_of_birth",
        month(col("BIRTH_DATE"))
    ).withColumn(
        "day_of_birth",
        dayofmonth(col("BIRTH_DATE"))
    )

    print("📄 Sample transformed person data:")
    person_df.select(
        "person_id", "person_source_value", "gender_concept_id",
        "race_concept_id", "ethnicity_concept_id", "year_of_birth"
    ).show(5)

    # Show concept mapping results
    print("📊 Concept mapping summary:")
    person_df.groupBy("gender_concept_id").count().show()
    person_df.groupBy("race_concept_id").count().show()
    person_df.groupBy("ethnicity_concept_id").count().show()

    return person_df

def test_encounter_etl(spark):
    """Test encounter to visit ETL on a capped sample.

    Loads at most 1000 rows from the encounter parquet file, prints a sample,
    derives visit columns (sequential ``visit_occurrence_id``, a
    ``visit_concept_id`` looked up from ``ENC_TYPE``, and start / end dates
    from ``ADMIT_DATE`` / ``DISCHARGE_DATE``), then prints a sample of the
    result and the row count per visit concept.

    Args:
        spark: Active SparkSession used to read the parquet file.

    Returns:
        The transformed visit DataFrame, or ``None`` when the input file does
        not exist on the local filesystem.
    """
    print("\n=== Testing Encounter to Visit ETL ===")

    source_file = "/home/asadr/datasets/stroke_data/encounter.parquet"
    if not os.path.exists(source_file):
        print(f"❌ File not found: {source_file}")
        return None

    # Cap the input so the test stays small.
    encounters = spark.read.parquet(source_file).limit(1000)
    print(f"📊 Loaded {encounters.count()} encounter records (limited for testing)")

    print("📄 Sample encounter data:")
    raw_columns = ["ENCOUNTERID", "PATID", "ENC_TYPE", "ADMIT_DATE", "DISCHARGE_DATE"]
    encounters.select(*raw_columns).show(5)

    # ENC_TYPE code -> visit concept id, in the order the branches are tested.
    enc_type_concepts = [
        ("AV", 9202),
        ("ED", 9203),
        ("EI", 262),
        ("IP", 9201),
        ("IS", 9201),
        ("OA", 9202),
        ("OS", 9202),
    ]
    first_code, first_concept = enc_type_concepts[0]
    visit_concept = when(col("ENC_TYPE") == first_code, lit(first_concept))
    for code, concept in enc_type_concepts[1:]:
        visit_concept = visit_concept.when(col("ENC_TYPE") == code, lit(concept))
    # Any other (or missing) encounter type maps to 0.
    visit_concept = visit_concept.otherwise(lit(0))

    projected = encounters.select(
        col("ENCOUNTERID").alias("visit_source_value"),
        col("PATID"),
        col("ENC_TYPE"),
        col("ADMIT_DATE"),
        col("DISCHARGE_DATE"),
    )

    # Ids are assigned over a single global ordering of the source id.
    id_window = Window.orderBy("visit_source_value")
    visit_df = projected.withColumn("visit_occurrence_id", row_number().over(id_window))
    visit_df = visit_df.withColumn("visit_concept_id", visit_concept)
    visit_df = visit_df.withColumn("visit_start_date", to_date(col("ADMIT_DATE")))
    visit_df = visit_df.withColumn("visit_end_date", to_date(col("DISCHARGE_DATE")))

    print("📄 Sample transformed visit data:")
    preview_columns = [
        "visit_occurrence_id",
        "visit_source_value",
        "visit_concept_id",
        "visit_start_date",
        "visit_end_date",
    ]
    visit_df.select(*preview_columns).show(5)

    print("📊 Encounter type mapping summary:")
    concept_counts = visit_df.groupBy("visit_concept_id").count()
    concept_counts.show()

    return visit_df

def test_diagnosis_etl(spark):
    """Test diagnosis to condition ETL on a capped sample.

    Loads at most 1000 rows from the diagnosis parquet file, prints a sample,
    derives condition columns (sequential ``condition_occurrence_id``, a
    placeholder ``condition_concept_id`` of 0, a ``condition_type_concept_id``
    looked up from ``DX_TYPE``, and ``condition_start_date`` from
    ``DX_DATE``), then prints a sample of the result and the row count per
    condition type concept.

    Args:
        spark: Active SparkSession used to read the parquet file.

    Returns:
        The transformed condition DataFrame, or ``None`` when the input file
        does not exist on the local filesystem.
    """
    print("\n=== Testing Diagnosis to Condition ETL ===")

    source_file = "/home/asadr/datasets/stroke_data/diagnosis.parquet"
    if not os.path.exists(source_file):
        print(f"❌ File not found: {source_file}")
        return None

    # Cap the input so the test stays small.
    diagnoses = spark.read.parquet(source_file).limit(1000)
    print(f"📊 Loaded {diagnoses.count()} diagnosis records (limited for testing)")

    print("📄 Sample diagnosis data:")
    raw_columns = ["DIAGNOSISID", "PATID", "ENCOUNTERID", "DX", "DX_TYPE", "DX_DATE"]
    diagnoses.select(*raw_columns).show(5)

    # DX_TYPE code -> condition type concept id; anything else gets the
    # fallback value below.
    # NOTE(review): PD/SD/AD are compared against DX_TYPE here -- confirm
    # these are values that column actually carries in the source data.
    dx_type_concepts = {"PD": 38000201, "SD": 38000245, "AD": 38000230}
    fallback_type_concept = 38000245

    type_concept = None
    for code, concept in dx_type_concepts.items():
        is_code = col("DX_TYPE") == code
        if type_concept is None:
            type_concept = when(is_code, lit(concept))
        else:
            type_concept = type_concept.when(is_code, lit(concept))
    type_concept = type_concept.otherwise(lit(fallback_type_concept))

    projected = diagnoses.select(
        col("DIAGNOSISID").alias("condition_source_value"),
        col("PATID"),
        col("ENCOUNTERID"),
        col("DX"),
        col("DX_TYPE"),
        col("DX_DATE"),
    )

    # Ids are assigned over a single global ordering of the source id.
    id_window = Window.orderBy("condition_source_value")
    condition_df = projected.withColumn(
        "condition_occurrence_id", row_number().over(id_window)
    )
    # Simplified: a real mapping would need the vocabulary tables.
    condition_df = condition_df.withColumn("condition_concept_id", lit(0))
    condition_df = condition_df.withColumn("condition_type_concept_id", type_concept)
    condition_df = condition_df.withColumn(
        "condition_start_date", to_date(col("DX_DATE"))
    )

    print("📄 Sample transformed condition data:")
    preview_columns = [
        "condition_occurrence_id",
        "condition_source_value",
        "condition_concept_id",
        "condition_type_concept_id",
        "condition_start_date",
    ]
    condition_df.select(*preview_columns).show(5)

    print("📊 Diagnosis type mapping summary:")
    type_counts = condition_df.groupBy("condition_type_concept_id").count()
    type_counts.show()

    return condition_df

def main():
    """Run the three ETL smoke tests and print a summary.

    Creates the Spark session, runs the demographic, encounter and diagnosis
    tests in that order, prints the row count of each result (0 for a step
    that returned ``None`` because its input file was missing) and a list of
    next steps. The Spark session is always stopped before returning.

    Returns:
        int: 0 if every step ran without raising, 1 if an exception was
        caught (its message is printed).
    """
    print("🧪 PICORI to OMOP Simple ETL Test")
    print("="*50)

    # Create Spark session
    spark = create_spark_session()

    try:
        # Test demographic ETL
        person_df = test_demographic_etl(spark)

        # Test encounter ETL
        visit_df = test_encounter_etl(spark)

        # Test diagnosis ETL
        condition_df = test_diagnosis_etl(spark)

        # The steps signal "input missing" with None; compare against None
        # explicitly instead of relying on the truthiness of a DataFrame.
        print("\n✅ ETL Test Summary:")
        print(f"  - Person records: {person_df.count() if person_df is not None else 0}")
        print(f"  - Visit records: {visit_df.count() if visit_df is not None else 0}")
        print(f"  - Condition records: {condition_df.count() if condition_df is not None else 0}")

        print("\n🎯 Next Steps:")
        print("1. Set up PostgreSQL database")
        print("2. Load OMOP vocabularies")
        print("3. Run full ETL with proper concept mapping")
        print("4. Implement memory optimization for large tables")

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and turn it into a
        # non-zero exit status.
        print(f"❌ Error during ETL test: {e}")
        return 1

    finally:
        # Runs on both the success and failure paths.
        spark.stop()

# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)