Changes from all commits (31 commits)
40f0cde
feat: analyze_imagenet_hpo
ghkim9213 Oct 29, 2025
2e3d449
increase n_samples for ks
ghkim9213 Oct 30, 2025
3a750f0
Merge pull request #1 from OpenXAIProject/experiments/image_net_quali…
seongun-kim Oct 31, 2025
f20b9dd
refactor: Split monolithic helpers.py into modular files
seongun-kim Oct 31, 2025
3d308c3
Merge pull request #2 from OpenXAIProject/feature/livertumor-explanat…
seongun-kim Oct 31, 2025
2e02407
feat: Add HPO impact analysis
seongun-kim Oct 31, 2025
901bee1
Merge pull request #3 from OpenXAIProject/feature/hpo-impact-analysis
seongun-kim Oct 31, 2025
0771ebc
refactor: Update import paths to reflect utils
seongun-kim Nov 4, 2025
5376489
docs: Update README.md
seongun-kim Nov 4, 2025
870538e
Merge pull request #4 from OpenXAIProject/docs/update-readme
seongun-kim Nov 4, 2025
44087fb
feat: add aki experiment
enver1323 Nov 13, 2025
3ecd810
fix: remove datasets.py typo
enver1323 Nov 13, 2025
2e4264a
Merge pull request #6 from OpenXAIProject/feature/aki-explanation-ana…
seongun-kim Nov 13, 2025
c6d04f4
feat: add ecg experiment
enver1323 Nov 18, 2025
ee4d7fb
Merge pull request #7 from OpenXAIProject/feature/ecg-explanation-ana…
seongun-kim Nov 20, 2025
cc218dc
feat: add mimiciii preprocessing
enver1323 Nov 21, 2025
908f78f
feat: change aki dataset loading script
enver1323 Nov 21, 2025
9b7c5df
Merge remote-tracking branch 'origin/main' into feature/aki-explanati…
enver1323 Nov 21, 2025
784aac9
feat: update readme.md
enver1323 Nov 21, 2025
2d0ed07
Merge pull request #8 from OpenXAIProject/feature/aki-explanation-ana…
seongun-kim Nov 24, 2025
708bbc5
init commit
shiningstone23 Nov 12, 2025
387856f
commit
shiningstone23 Nov 15, 2025
7ef1259
feat/wine_quality
shiningstone23 Nov 22, 2025
897c672
feat/tabular:integrate codes
shiningstone23 Nov 22, 2025
c728289
feat/tabulr:mis
shiningstone23 Nov 22, 2025
0eabacb
refactor: standardize tabular experiment structure and utils
seongun-kim Nov 25, 2025
c482a4a
Merge pull request #9 from OpenXAIProject/feature/tabular-analysis
seongun-kim Nov 25, 2025
a5829c2
fix: export missing utils for ECG experiment
seongun-kim Nov 25, 2025
bcc0c35
Merge pull request #10 from OpenXAIProject/fix/ecg-missing-utils
seongun-kim Nov 25, 2025
53f021f
docs: update README and contributors
seongun-kim Nov 25, 2025
cabcc38
Merge pull request #11 from OpenXAIProject/docs/update-readme
seongun-kim Nov 25, 2025
5 changes: 5 additions & 0 deletions .gitignore
@@ -8,3 +8,8 @@ MANIFEST
*/.ipynb_checkpoints/*
**/NOTES
**/*.mp4

results
benchmark
logs
etc
66 changes: 66 additions & 0 deletions Dockerfile.wine_quality
@@ -0,0 +1,66 @@
# Wine Quality Experiment Dockerfile
# Extends the base pnpxai-experiments image with additional XAI frameworks

# Use the base image from README
FROM seongun/ubuntu22.04-cuda12.2.2-cudnn8-pytorch2.1:base

# Set working directory
WORKDIR /root/pnpxai-experiments

# Reinstall pnpxai from exp/tab branch
RUN pip uninstall -y pnpxai && \
    pip install --no-cache-dir git+https://github.com/OpenXAIProject/pnpxai.git@exp/tab

# Install additional XAI frameworks for Wine Quality experiments
# Pin specific versions for reproducibility (verified on 2025-11-13)
# Note: shap<=0.44.0 required for OmniXAI v1.3.2 compatibility (output format changed in 0.45.0)
RUN pip install --no-cache-dir \
    captum==0.8.0 \
    scikit-learn==1.1.3 \
    pandas==2.3.3 \
    xgboost==3.1.1 \
    shap==0.44.0 \
    lime==0.2.0.1 \
    pyyaml==6.0 \
    tqdm==4.66.0 \
    ucimlrepo

# Install XAI frameworks from GitHub
# Pin to specific versions for reproducibility (verified on 2025-11-13)

# OmniXAI v1.3.2 - Salesforce's comprehensive XAI library
RUN pip install --no-cache-dir git+https://github.com/salesforce/OmniXAI.git@v1.3.2

# OpenXAI v0.1 - Standardized XAI evaluation framework
# Note: Uses latest commit as no version tags available
RUN pip install --no-cache-dir git+https://github.com/AI4LIFE-GROUP/OpenXAI.git

# Create separate virtual environment for AutoXAI
# AutoXAI code is mounted from experiments/scripts/lib/AutoXAI (not cloned in Docker)
# AutoXAI requires bayesian-optimization which needs numpy<2.0, but other frameworks need numpy>=2.0
# AutoXAI also requires aix360, which depends on xport (requires pandas<1.4) and cvxpy
RUN python -m venv /opt/autoxai_venv && \
    /opt/autoxai_venv/bin/pip install --upgrade pip && \
    /opt/autoxai_venv/bin/pip install --no-cache-dir \
        'numpy<2.0' \
        scikit-learn==1.1.3 \
        scikit-learn-extra \
        scikit-optimize \
        'pandas<1.4' \
        xgboost==3.1.1 \
        shap==0.49.1 \
        lime==0.2.0.1 \
        aix360 \
        xport \
        cvxpy \
        pyyaml==6.0 \
        tqdm==4.66.0 \
        bayesian-optimization \
        ucimlrepo && \
    /opt/autoxai_venv/bin/pip install --no-cache-dir git+https://github.com/OpenXAIProject/pnpxai.git@exp/tab

# Clean up pip cache to reduce image size
RUN pip cache purge

# Set default command
CMD ["/bin/bash"]
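
A hedged sketch of building and entering this image (the tag, mount, and script file name are illustrative, not part of the repository; the venv path `/opt/autoxai_venv` and the mount point come from the Dockerfile above):

```bash
# Build the wine-quality image from the repository root.
docker build -f Dockerfile.wine_quality -t pnpxai-wine-quality .

# Run interactively, mounting the experiments checkout over the workdir.
docker run --rm -it -v "$PWD":/root/pnpxai-experiments pnpxai-wine-quality

# Inside the container, AutoXAI scripts must use the dedicated venv
# because of its numpy<2.0 / pandas<1.4 pins (illustrative script name):
/opt/autoxai_venv/bin/python experiments/scripts/lib/AutoXAI/run.py
```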
262 changes: 243 additions & 19 deletions README.md

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions data/mimiciii/README.md
@@ -0,0 +1,85 @@
# MIMIC III Data Generation

## Data Loading

[MIMIC III Clinical Database](https://doi.org/10.13026/C2XW26) is a large database of anonymized records covering more than forty thousand patients. The data provided at the source must first be downloaded and built; the build process is covered by the official [GitHub package](https://github.com/MIT-LCP/mimic-code/tree/main/mimic-iii/buildmimic/postgres). We use the PostgreSQL version of the built database for quick and convenient querying.
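
The exact invocation is defined by the upstream build scripts and may change between revisions; as a rough sketch, assuming the gzipped CSV archives have already been downloaded (the Makefile target and `datadir` value below follow the upstream documentation and should be verified against it):

```bash
# Fetch the official build scripts and load the gzipped CSVs into PostgreSQL.
git clone https://github.com/MIT-LCP/mimic-code.git
cd mimic-code/mimic-iii/buildmimic/postgres
make create-user mimic-gz datadir="/path/to/mimic-iii/csv"
```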

## Data Formatting

Once MIMIC III has been built on a PostgreSQL DBMS, the data needs to be preprocessed. This process comprises three essential stages (chained end to end in the sketch below):
* Database parsing
* Preprocessing
* Data cleanup
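
Assuming the default arguments documented in the sections below and a local PostgreSQL instance, the three stages can be chained in one shell session; a minimal sketch (adjust credentials and paths to your setup):

```bash
# Stage 1: build materialized views and export the intermediate CSVs.
python parse_db.py --host localhost --db mimic --user postgres \
    --password postgres --out_path ./formatted

# Stage 2: merge the exports with the gzipped source files.
python preprocess.py --formatted_path ./formatted --data_path ./data

# Stage 3: clean the merged table into the final analysis-ready CSV.
python cleanup.py --data_path ./formatted/INFO_DATASET_7days_creatinine.csv \
    --formatted_path ./formatted/data.csv
```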

### Database Parsing

The [`parse_db.py`](./parse_db.py) script connects to the database and builds the essential materialized views, which can then be queried for quick data extraction.

#### Usage

```bash
python parse_db.py \
--host localhost \
--db mimic \
--user postgres \
--password postgres \
--out_path ./formatted
```

#### Arguments

* `--host <HOST>`: The host of the PostgreSQL DBMS containing the built MIMIC III database. Defaults to `localhost`.
* `--db <DATABASE>`: The name of the PostgreSQL database containing the built MIMIC III data. Defaults to `mimic`.
* `--user <USER>`: The user owning that PostgreSQL database. Defaults to `postgres`.
* `--password <PASSWORD>`: The password of that user. Defaults to `postgres`.
* `--out_path <OUT_PATH>`: The target directory for intermediate files produced by the script. Defaults to `./formatted`.

#### Output

Results are saved under the path specified by `OUT_PATH`. The directory is expected to contain:
* `AKI_KIDIGO_7D_SQL_CREATININE_DBSOURCE.csv`
* `labstay_DBSOURCE.csv`
* `chart_vitals_stay_DBSOURCE.csv`
* `comorbidities_DBSOURCE.csv`

### Preprocessing

The [`preprocess.py`](./preprocess.py) script combines the data from the previous step with the source data files to build a near-final version of the dataset.

#### Usage

```bash
python preprocess.py \
--formatted_path ./formatted \
--data_path ./data
```

#### Arguments

* `--formatted_path <FORMATTED_PATH>`: The path where the files produced by `parse_db.py` are stored. Defaults to `./formatted`.
* `--data_path <DATA_PATH>`: The path to the gzipped MIMIC III data. Defaults to `./data`.

#### Output

Results are saved under the path specified by `FORMATTED_PATH`; the script adds `INFO_DATASET_7days_creatinine.csv` to that directory.

### Data Cleanup

The [`cleanup.py`](./cleanup.py) script cleans the data from the previous step and prepares it for analysis.

#### Usage

```bash
python cleanup.py \
--data_path ./formatted/INFO_DATASET_7days_creatinine.csv \
--formatted_path ./formatted/data.csv
```

#### Arguments

* `--data_path <DATA_PATH>`: The path to the file produced by `preprocess.py` (`INFO_DATASET_7days_creatinine.csv`). Defaults to `./formatted/INFO_DATASET_7days_creatinine.csv`.
* `--formatted_path <FORMATTED_PATH>`: The target path for the final, analysis-ready version of the data. Defaults to `./formatted/data.csv`.

#### Output

Results are saved at the path specified by `FORMATTED_PATH`; the final file is `data.csv`.
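
A quick sanity check on the final file can be done in plain shell (a hedged sketch; the exact column set is determined by `cleanup.py`):

```bash
# Count the columns in the header and the data rows of the cleaned dataset.
head -n 1 ./formatted/data.csv | tr ',' '\n' | wc -l
tail -n +2 ./formatted/data.csv | wc -l
```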
197 changes: 197 additions & 0 deletions data/mimiciii/cleanup.py
@@ -0,0 +1,197 @@
from argparse import ArgumentParser
import pandas as pd
import os


def code_ethnicity(ethnicity: str) -> int:
    if ethnicity in {
        "UNKNOWN/NOT SPECIFIED",
        "OTHER",
        "PATIENT DECLINED TO ANSWER",
        "MULTI RACE ETHNICITY",
        "UNABLE TO OBTAIN",
    }:
        return -1
    if ethnicity in {
        "WHITE",
        "WHITE - RUSSIAN",
        "WHITE - EASTERN EUROPEAN",
        "WHITE - OTHER EUROPEAN",
        "WHITE - BRAZILIAN",
    }:
        return 0
    if ethnicity in {
        "BLACK/AFRICAN AMERICAN",
        "BLACK/AFRICAN",
        "BLACK/HAITIAN",
        "BLACK/CAPE VERDEAN",
    }:
        return 1
    if ethnicity in {
        "ASIAN",
        "ASIAN - ASIAN INDIAN",
        "ASIAN - VIETNAMESE",
        "ASIAN - CHINESE",
        "ASIAN - FILIPINO",
        "ASIAN - CAMBODIAN",
        "ASIAN - THAI",
        "ASIAN - OTHER",
        "ASIAN - KOREAN",
        "ASIAN - JAPANESE",
    }:
        return 2
    if ethnicity in {
        "HISPANIC OR LATINO",
        "HISPANIC/LATINO - GUATEMALAN",
        "HISPANIC/LATINO - PUERTO RICAN",
        "HISPANIC/LATINO - DOMINICAN",
        "HISPANIC/LATINO - SALVADORAN",
        "HISPANIC/LATINO - COLOMBIAN",
        "PORTUGUESE",
        "HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)",
        "HISPANIC/LATINO - HONDURAN",
        "HISPANIC/LATINO - CUBAN",
        "HISPANIC/LATINO - MEXICAN",
    }:
        return 3
    if ethnicity in {
        "AMERICAN INDIAN/ALASKA NATIVE",
        "AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE",
    }:
        return 4
    if ethnicity in {"CARIBBEAN ISLAND", "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER"}:
        return 5
    if ethnicity == "SOUTH AMERICAN":
        return 6
    if ethnicity == "MIDDLE EASTERN":
        return 7

    raise NotImplementedError(f"Unhandled ethnicity: {ethnicity}")


def code_system(system: str) -> int:
    if system == "carevue":
        return 0
    if system == "metavision":
        return 1
    raise NotImplementedError(f"Unhandled DB source: {system}")


def code_gender(gender: str) -> int:
    return int(gender == "M")


def cleanup_data(filename: str) -> pd.DataFrame:
    # Read the data from the CSV.
    df = pd.read_csv(filename)
    df.columns = map(str.upper, df.columns)
    print(df.shape)

    # Exclude CKD patients and patients with AKI on admission.
    df = df[~df["AKI"].isin([2, 3, 4])]

    print(df.groupby("AKI")["ICUSTAY_ID"].nunique())

    # Consider only adults.
    df = df[~(df["AGE"] < 18)]

    df["ETHNICITY"] = df["ETHNICITY"].apply(code_ethnicity)
    df["GENDER"] = df["GENDER"].apply(code_gender)

    print(df.groupby("ETHNICITY")["ICUSTAY_ID"].nunique())

    # Normalize column names left over from the merges in preprocess.py;
    # note that both SUBJECT_ID_Y and SUBJECT_ID_X.1 map to SUBJECT_ID.
    df = df.rename(
        columns={
            "HADM_ID_X": "HADM_ID",
            "GLUCOSE_MIN_X": "GLUCOSE_MIN",
            "GLUCOSE_MAX_X": "GLUCOSE_MAX",
            "SUBJECT_ID_Y": "SUBJECT_ID",
            "SUBJECT_ID_X.1": "SUBJECT_ID",
            "DBSOURCE_Y": "DBSOURCE",
        }
    )
    df = df.fillna(0)

    # Drop the second column.
    df = df.drop(df.columns[1], axis=1)

    print(df.groupby("AKI")["ICUSTAY_ID"].nunique())
    print(df.groupby("AKI_STAGE_7DAY")["ICUSTAY_ID"].nunique())

    print(f"Non AKI patients: {df.loc[df['AKI_STAGE_7DAY'] == 0]['ICUSTAY_ID'].count()}")
    print(f"AKI patients STAGE 1: {df.loc[df['AKI_STAGE_7DAY'] == 1]['ICUSTAY_ID'].count()}")
    print(f"AKI patients STAGE 2: {df.loc[df['AKI_STAGE_7DAY'] == 2]['ICUSTAY_ID'].count()}")
    print(f"AKI patients STAGE 3: {df.loc[df['AKI_STAGE_7DAY'] == 3]['ICUSTAY_ID'].count()}")
    print(f"NAN patients: {df['AKI'].isna().sum()}")

    # Drop raw timestamps and duplicated merge columns not used downstream.
    df = df.drop(
        [
            "ADMITTIME",
            "DISCHTIME",
            "OUTTIME",
            "INTIME",
            "DOB",
            "CHARTTIME_CREAT",
            "UNNAMED: 0",
            "AKI_STAGE_CREAT",
            "AKI_7DAY",
            "GLUCOSE_MAX_Y",
            "GLUCOSE_MIN_Y",
            "DBSOURCE_X",
        ],
        axis=1,
    )

    # If the rename above produced duplicated DBSOURCE columns, keep the first.
    if isinstance(df["DBSOURCE"], pd.DataFrame):
        df["DBSOURCE_NEW"] = df["DBSOURCE"].iloc[:, 0]
        df = df.drop(["DBSOURCE"], axis=1)
        df = df.rename(columns={"DBSOURCE_NEW": "DBSOURCE"})

    # Keep stays recorded in exactly one source system, then encode it.
    df = df[~(df["DBSOURCE"] == "both")]
    df["DBSOURCE"] = df["DBSOURCE"].apply(code_system)

    return df


def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--data_path",
        type=str,
        default="./formatted/INFO_DATASET_7days_creatinine.csv",
        help='Path to formatted MIMIC III data from "preprocess.py"',
    )
    parser.add_argument(
        "--formatted_path",
        type=str,
        default="./formatted/data.csv",
        help="Output path to store cleaned data",
    )
    args = parser.parse_args()

    df = cleanup_data(args.data_path)
    df.to_csv(args.formatted_path)


if __name__ == "__main__":
    main()