From cd1def5cccf4ab44c330205de545438207f87f03 Mon Sep 17 00:00:00 2001 From: Johan Mathe Date: Thu, 29 Jan 2026 05:47:46 +0000 Subject: [PATCH 1/2] Anonymize --- .gitignore | 6 + LICENSE | 2 +- README.md | 14 +- notebooks/datasets.ipynb | 268 +++--- ogbench/baseline.py | 4 +- pyproject.toml | 5 +- scripts/export_wandb.py | 2 +- scripts/plot_adjacency_threshold_analysis.py | 2 +- scripts/utils.py | 2 +- tutorials/dataset_stats.ipynb | 904 +++++++++---------- tutorials/dataset_stats_analysis.py | 6 +- webapp/README.md | 2 +- webapp/precompute_stats.py | 2 +- 13 files changed, 610 insertions(+), 609 deletions(-) diff --git a/.gitignore b/.gitignore index d0a90784..5ba05b56 100644 --- a/.gitignore +++ b/.gitignore @@ -192,3 +192,9 @@ logs/ *.parquet datasets/ run_data/ + +# Anonymization: exclude files with identifying info for ICML submission +search_results/ +outputs/ +wandb_analysis_report.md +.cursor/ diff --git a/LICENSE b/LICENSE index 86852bbf..c312126d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 geometric-intelligence +Copyright (c) 2025 Anonymous Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 36181af6..445382f9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,3 @@ -[![Code Quality](https://github.com/geometric-intelligence/bgbench/actions/workflows/code-quality-main.yaml/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/code-quality-main.yaml) -[![Dependencies](https://github.com/geometric-intelligence/bgbench/actions/workflows/dependabot/dependabot-updates/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/dependabot/dependabot-updates) -[![Tests](https://github.com/geometric-intelligence/bgbench/actions/workflows/test.yml/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/test.yml) - # Big Graph Bench (BGBench) A comprehensive benchmarking framework for Graph Neural Networks (GNNs) on omics datasets for classification tasks. This repository provides standardized datasets, preprocessing pipelines, and evaluation metrics to enable fair comparison of different GNN architectures on biological data. @@ -24,7 +20,7 @@ The easiest way to set up BGBench is using the provided environment setup script ```bash # Clone the repository -git clone git@github.com:geometric-intelligence/bgbench.git +git clone cd bgbench # Run the automated setup script @@ -44,7 +40,7 @@ If you prefer manual setup: ```bash # Clone the repository -git clone git@github.com:geometric-intelligence/bgbench.git +git clone cd bgbench # Create conda environment @@ -91,7 +87,7 @@ BGBench includes three curated omics datasets for graph-based classification: ### Dataset Storage and Access -All datasets are stored on Hugging Face Hub at `geometric-intelligence/bgbench` and automatically downloaded when needed. The datasets are preprocessed and stored in Parquet format for efficient loading. +All datasets are stored on Hugging Face Hub at `/bgbench` and automatically downloaded when needed. The datasets are preprocessed and stored in Parquet format for efficient loading. ## Dataset Preprocessing @@ -392,9 +388,9 @@ If you use BGBench in your research, please cite: ```bibtex @software{bgbench2024, title={Big Graph Bench: A Benchmarking Framework for Graph Neural Networks on Omics Data}, - author={Geometric Intelligence Team}, + author={Anonymous Authors}, year={2024}, - url={https://github.com/geometric-intelligence/bgbench} + url={} } ``` diff --git a/notebooks/datasets.ipynb b/notebooks/datasets.ipynb index a40af1c5..6bf377cd 100644 --- a/notebooks/datasets.ipynb +++ b/notebooks/datasets.ipynb @@ -1,137 +1,137 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "\n", - "sys.path.append(\"/home/johmathe/bgbench\")\n", - "os.environ[\"PYTHONPATH\"] = os.pathsep.join(sys.path)\n", - "from src.data import hf_datamodule\n" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(\".\"))))\n", + "os.environ[\"PYTHONPATH\"] = os.pathsep.join(sys.path)\n", + "from src.data import hf_datamodule\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data inspection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set style\n", + "sns.set_palette(\"husl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set style\n", + "sns.set_palette(\"husl\")\n", + "\n", + "\n", + "def plot_dataset_distributions(datamodule_class, title):\n", + " \"\"\"Plot feature and target distributions for a dataset.\"\"\"\n", + " # Initialize datamodule\n", + " datamodule = datamodule_class()\n", + " datamodule.prepare_data()\n", + "\n", + " # Load data\n", + " selected_data = pd.read_parquet(datamodule.selected_data_path)\n", + " targets = np.load(datamodule.targets_path)\n", + "\n", + " # Create figure\n", + " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n", + " fig.suptitle(f\"{title} Dataset Distributions\", fontsize=16)\n", + "\n", + " # Plot feature distributions\n", + " sns.histplot(data=selected_data.values.flatten(), ax=ax1, bins=1000)\n", + " ax1.set_title(\"Feature Value Distribution\")\n", + " ax1.set_xlabel(\"Feature Value\")\n", + " ax1.set_ylabel(\"Count\")\n", + "\n", + " # Plot target distribution\n", + " sns.histplot(data=targets, ax=ax2, bins=50)\n", + " ax2.set_title(\"Target Value Distribution\")\n", + " ax2.set_xlabel(\"Target Value\")\n", + " ax2.set_ylabel(\"Count\")\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + " # Print statistics\n", + " print(f\"\\n{title} Dataset Statistics:\")\n", + " print(f\"Number of samples: {len(targets)}\")\n", + " print(f\"Number of features: {selected_data.shape[1]}\")\n", + " print(\"\\nTarget Statistics:\")\n", + " print(f\"Mean: {np.mean(targets):.2f}\")\n", + " print(f\"Std: {np.std(targets):.2f}\")\n", + " print(f\"Min: {np.min(targets):.2f}\")\n", + " print(f\"Max: {np.max(targets):.2f}\")\n", + "\n", + "\n", + "# Plot distributions for each dataset\n", + "data = hf_datamodule.HFMotrPacDataModule()\n", + "data = hf_datamodule.HFAddNeuroMedOmicsDataModule()\n", + "data = hf_datamodule.HFCovidAKIOmicsDataModule()\n", + "data = hf_datamodule.HFParkinsonsOmicsDataModule()\n", + "\n", + "data.prepare_data()\n", + "\n", + "# plot_dataset_distributions(data, data.dataset_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bgbench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data inspection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set style\n", - "sns.set_palette(\"husl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set style\n", - "sns.set_palette(\"husl\")\n", - "\n", - "\n", - "def plot_dataset_distributions(datamodule_class, title):\n", - " \"\"\"Plot feature and target distributions for a dataset.\"\"\"\n", - " # Initialize datamodule\n", - " datamodule = datamodule_class()\n", - " datamodule.prepare_data()\n", - "\n", - " # Load data\n", - " selected_data = pd.read_parquet(datamodule.selected_data_path)\n", - " targets = np.load(datamodule.targets_path)\n", - "\n", - " # Create figure\n", - " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n", - " fig.suptitle(f\"{title} Dataset Distributions\", fontsize=16)\n", - "\n", - " # Plot feature distributions\n", - " sns.histplot(data=selected_data.values.flatten(), ax=ax1, bins=1000)\n", - " ax1.set_title(\"Feature Value Distribution\")\n", - " ax1.set_xlabel(\"Feature Value\")\n", - " ax1.set_ylabel(\"Count\")\n", - "\n", - " # Plot target distribution\n", - " sns.histplot(data=targets, ax=ax2, bins=50)\n", - " ax2.set_title(\"Target Value Distribution\")\n", - " ax2.set_xlabel(\"Target Value\")\n", - " ax2.set_ylabel(\"Count\")\n", - "\n", - " plt.tight_layout()\n", - " plt.show()\n", - "\n", - " # Print statistics\n", - " print(f\"\\n{title} Dataset Statistics:\")\n", - " print(f\"Number of samples: {len(targets)}\")\n", - " print(f\"Number of features: {selected_data.shape[1]}\")\n", - " print(\"\\nTarget Statistics:\")\n", - " print(f\"Mean: {np.mean(targets):.2f}\")\n", - " print(f\"Std: {np.std(targets):.2f}\")\n", - " print(f\"Min: {np.min(targets):.2f}\")\n", - " print(f\"Max: {np.max(targets):.2f}\")\n", - "\n", - "\n", - "# Plot distributions for each dataset\n", - "data = hf_datamodule.HFMotrPacDataModule()\n", - "data = hf_datamodule.HFAddNeuroMedOmicsDataModule()\n", - "data = hf_datamodule.HFCovidAKIOmicsDataModule()\n", - "data = hf_datamodule.HFParkinsonsOmicsDataModule()\n", - "\n", - "data.prepare_data()\n", - "\n", - "# plot_dataset_distributions(data, data.dataset_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bgbench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/ogbench/baseline.py b/ogbench/baseline.py index 58370398..1351c6d2 100644 --- a/ogbench/baseline.py +++ b/ogbench/baseline.py @@ -171,7 +171,7 @@ def load_metadata(data_name: str, cfg: DictConfig) -> dict[str, Any] | None: # Download from HuggingFace try: logger.info('Downloading metadata from HuggingFace...') - hf_repo_id = 'geometric-intelligence/bgbench' + hf_repo_id = '/bgbench' revision = cfg.dataset.loader.parameters.get('revision', 'e1631e8') metadata_file = hf_hub_download( # nosec @@ -248,7 +248,7 @@ def load_and_prepare_data(cfg: DictConfig) -> DatasetContainer: # Download from HuggingFace logger.info('Downloading from HuggingFace...') - hf_repo_id = 'geometric-intelligence/bgbench' + hf_repo_id = '/bgbench' revision = cfg.dataset.loader.parameters.get('revision', 'e1631e8') data_file = hf_hub_download( # nosec diff --git a/pyproject.toml b/pyproject.toml index 251444e9..681631e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,9 +94,8 @@ all = ["ogbench[dev, doc]"] [project.scripts] ogbench-train = "ogbench.run:main" -[project.urls] -homepage="https://geometric-intelligence.github.io/bgbench/index.html" -repository="https://github.com/geometric-intelligence/bgbench" +# [project.urls] +# homepage and repository URLs removed for anonymous review [tool.ruff] line-length = 99 diff --git a/scripts/export_wandb.py b/scripts/export_wandb.py index f2588b76..d657b404 100644 --- a/scripts/export_wandb.py +++ b/scripts/export_wandb.py @@ -18,7 +18,7 @@ def flatten_dict(d: dict[str, Any], parent_key: str = '', sep: str = '_') -> dic def main() -> None: api = wandb.Api() - runs = api.runs('johmathe/biggraphbench') + runs = api.runs('/biggraphbench') summary_list: list[dict[str, Any]] = [] config_list: list[dict[str, Any]] = [] diff --git a/scripts/plot_adjacency_threshold_analysis.py b/scripts/plot_adjacency_threshold_analysis.py index f54e3811..6c424ace 100755 --- a/scripts/plot_adjacency_threshold_analysis.py +++ b/scripts/plot_adjacency_threshold_analysis.py @@ -49,7 +49,7 @@ }, } -HF_REPO_ID = 'geometric-intelligence/bgbench' +HF_REPO_ID = '/bgbench' def load_and_preprocess_dataset( diff --git a/scripts/utils.py b/scripts/utils.py index 7b926cf3..5d9edb3d 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -79,7 +79,7 @@ def upload_to_huggingface( """ try: api = huggingface_hub.HfApi() - repo_id = 'geometric-intelligence/bgbench' + repo_id = '/bgbench' # Create repository if it doesn't exist try: diff --git a/tutorials/dataset_stats.ipynb b/tutorials/dataset_stats.ipynb index 8224f7c0..4f352562 100644 --- a/tutorials/dataset_stats.ipynb +++ b/tutorials/dataset_stats.ipynb @@ -1,455 +1,455 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6ce976e7", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "from hydra import compose, initialize\n", - "from hydra.core.global_hydra import GlobalHydra # Import GlobalHydra explicitly\n", - "from hydra.utils import instantiate\n", - "\n", - "from ogbench.utils.config_resolvers import (\n", - " get_default_transform,\n", - " get_monitor_metric,\n", - " get_monitor_mode,\n", - " infer_in_channels,\n", - ")\n", - "\n", - "# Clear GlobalHydra instance if already initialized\n", - "if GlobalHydra().is_initialized():\n", - " GlobalHydra().clear()\n", - "\n", - "initialize(config_path=\"../configs\", job_name=\"job\")" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "6ce976e7", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from hydra import compose, initialize\n", + "from hydra.core.global_hydra import GlobalHydra # Import GlobalHydra explicitly\n", + "from hydra.utils import instantiate\n", + "\n", + "from ogbench.utils.config_resolvers import (\n", + " get_default_transform,\n", + " get_monitor_metric,\n", + " get_monitor_mode,\n", + " infer_in_channels,\n", + ")\n", + "\n", + "# Clear GlobalHydra instance if already initialized\n", + "if GlobalHydra().is_initialized():\n", + " GlobalHydra().clear()\n", + "\n", + "initialize(config_path=\"../configs\", job_name=\"job\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "935b096d", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_name = \"addneuromed\"\n", + "\n", + "cfg = compose(\n", + " config_name=\"train.yaml\",\n", + " overrides=[\n", + " \"model=gat\",\n", + " f\"dataset={dataset_name}\",\n", + " \"dataset.loader.parameters.adjacency_threshold=0.5\",\n", + " \"dataset.loader.parameters.node_sample_ratio=full\",\n", + " ],\n", + " return_hydra_config=True,\n", + ")\n", + "loader = instantiate(cfg.dataset.loader)\n", + "dataset = loader.load_dataset()\n", + "print(dataset.processed_dir)\n", + "print(dataset[0])\n", + "\n", + "\n", + "def load_dataset(dataset_name, adj_thresh=0.5):\n", + " \"\"\"\n", + " Load the FTD dataset with a specified adjacency threshold.\n", + " \"\"\"\n", + " cfg = compose(\n", + " config_name=\"train.yaml\",\n", + " overrides=[\n", + " \"model=gat\",\n", + " f\"dataset={dataset_name}\",\n", + " f\"dataset.loader.parameters.adjacency_threshold={adj_thresh}\",\n", + " \"dataset.loader.parameters.node_sample_ratio=full\",\n", + " ],\n", + " return_hydra_config=True,\n", + " )\n", + " loader = instantiate(cfg.dataset.loader)\n", + " dataset = loader.load_dataset()\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31cb9b22", + "metadata": {}, + "outputs": [], + "source": [ + "root = \"./run_data/omics/\"\n", + "name = osp.join(\n", + " root,\n", + " f\"{dataset.data_name}\",\n", + " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", + " f\"{dataset.method}\",\n", + " f\"p_{dataset.node_sample_ratio}\",\n", + " f\"train_split_{dataset.train_val_test_split[0]}\",\n", + " \"raw/adj_matrix.npy\",\n", + ")\n", + "print(name)\n", + "adj_matrix = np.load(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7050e42f", + "metadata": {}, + "outputs": [], + "source": [ + "# Adjacency matrix loaded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8786a28d", + "metadata": {}, + "outputs": [], + "source": [ + "def get_graph_stats(dataset):\n", + " \"\"\"\n", + " Get statistics of the graph.\n", + " \"\"\"\n", + " # Load the adjacency matrix\n", + " root = \"./run_data/omics/\"\n", + " name = osp.join(\n", + " root,\n", + " f\"{dataset.data_name}\",\n", + " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", + " f\"{dataset.method}\",\n", + " f\"p_{dataset.node_sample_ratio}\",\n", + " f\"train_split_{dataset.train_val_test_split[0]}\",\n", + " \"raw/adj_matrix.npy\",\n", + " )\n", + " adj_matrix = np.load(name)\n", + "\n", + " # Generate a graph from the adjacency matrix\n", + " graph = nx.from_numpy_array(adj_matrix)\n", + " graph.remove_edges_from(nx.selfloop_edges(graph))\n", + "\n", + " # Calculate statistics\n", + " num_nodes = graph.number_of_nodes()\n", + " num_edges = graph.number_of_edges()\n", + " avg_degree = np.mean([d for _, d in graph.degree()])\n", + " density = nx.density(graph)\n", + " number_connected_components = nx.number_connected_components(graph)\n", + "\n", + " return {\n", + " \"num_nodes\": num_nodes,\n", + " \"num_edges\": num_edges,\n", + " \"avg_degree\": avg_degree,\n", + " \"density\": density,\n", + " \"number_connected_components\": number_connected_components,\n", + " }\n", + "\n", + "\n", + "# Get graph statistics\n", + "stats = get_graph_stats(dataset)\n", + "print(\"\\nGraph statistics:\\n\")\n", + "for key, value in stats.items():\n", + " print(f\"\\t{key}: {value}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "949f821e", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize a list to store all stats\n", + "all_stats = []\n", + "\n", + "for adj_thresh in np.arange(0, 1.01, 0.01):\n", + " # Load the dataset\n", + " dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", + " # Get graph statistics\n", + " stats = get_graph_stats(dataset)\n", + " # Add the adjacency threshold to the stats\n", + " stats[\"adj_thresh\"] = adj_thresh\n", + " # Append the stats to the list\n", + " all_stats.append(stats)\n", + "\n", + "# Save all stats to a CSV file\n", + "output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", + "os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", + "with open(output_file, \"w\", newline=\"\") as f:\n", + " writer = csv.DictWriter(\n", + " f,\n", + " fieldnames=[\n", + " \"adj_thresh\",\n", + " \"num_nodes\",\n", + " \"num_edges\",\n", + " \"avg_degree\",\n", + " \"density\",\n", + " \"number_connected_components\",\n", + " ],\n", + " )\n", + " writer.writeheader() # Write the header row\n", + " writer.writerows(all_stats) # Write all rows\n", + "\n", + "print(f\"Graph statistics saved to {output_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a93c9c0", + "metadata": {}, + "outputs": [], + "source": [ + "# import os, csv, math\n", + "# import numpy as np\n", + "# from concurrent.futures import ProcessPoolExecutor, as_completed\n", + "# from functools import partial\n", + "# from time import perf_counter\n", + "\n", + "# # ---------- helper: single-task worker ----------\n", + "# def _compute_stats_for_thresh(adj_thresh, dataset_name):\n", + "# \"\"\"\n", + "# Runs in a separate process. Must be top-level so it can be pickled.\n", + "# \"\"\"\n", + "# # Load the dataset for this threshold\n", + "# dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", + "# # Compute graph stats\n", + "# stats = get_graph_stats(dataset)\n", + "# # Attach the threshold\n", + "# stats[\"adj_thresh\"] = float(adj_thresh)\n", + "# return stats\n", + "\n", + "# # ---------- main parallel block ----------\n", + "# start = perf_counter()\n", + "\n", + "# # Make a stable list of thresholds (avoids float step accumulation issues)\n", + "# adj_thresholds = [round(x, 2) for x in np.linspace(0.0, 1.0, 101)]\n", + "\n", + "# # How many worker processes to use. Tweak if you want fewer.\n", + "# max_workers = os.cpu_count() or 2\n", + "\n", + "# # Run in parallel\n", + "# results_in_order = [None] * len(adj_thresholds)\n", + "# with ProcessPoolExecutor(max_workers=max_workers) as ex:\n", + "# # Submit all jobs\n", + "# futures = {\n", + "# ex.submit(_compute_stats_for_thresh, t, dataset_name): idx\n", + "# for idx, t in enumerate(adj_thresholds)\n", + "# }\n", + "# # Collect as they finish, but store back in original order\n", + "# for fut in as_completed(futures):\n", + "# idx = futures[fut]\n", + "# try:\n", + "# results_in_order[idx] = fut.result()\n", + "# except Exception as e:\n", + "# # You can choose to raise here, or record an error row.\n", + "# # For now, record a minimal row with the error noted.\n", + "# results_in_order[idx] = {\n", + "# \"adj_thresh\": adj_thresholds[idx],\n", + "# \"num_nodes\": None,\n", + "# \"num_edges\": None,\n", + "# \"avg_degree\": None,\n", + "# \"density\": None,\n", + "# \"number_connected_components\": None,\n", + "# \"error\": str(e),\n", + "# }\n", + "\n", + "# # Save to CSV (same columns you used; \"error\" is optional)\n", + "# output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", + "# os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", + "\n", + "# fieldnames = [\n", + "# \"adj_thresh\",\n", + "# \"num_nodes\",\n", + "# \"num_edges\",\n", + "# \"avg_degree\",\n", + "# \"density\",\n", + "# \"number_connected_components\",\n", + "# ]\n", + "# # If any row had an error, include that column so you can see what failed.\n", + "# if any((\"error\" in r) for r in results_in_order):\n", + "# fieldnames.append(\"error\")\n", + "\n", + "# with open(output_file, \"w\", newline=\"\") as f:\n", + "# writer = csv.DictWriter(f, fieldnames=fieldnames)\n", + "# writer.writeheader()\n", + "# writer.writerows(results_in_order)\n", + "\n", + "# print(f\"Graph statistics saved to {output_file}\")\n", + "# print(f\"Completed in {perf_counter() - start:.2f}s using {max_workers} workers.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07e6c5df", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(100, -1, -1):\n", + " print(i / 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "729e7c59", + "metadata": {}, + "outputs": [], + "source": [ + "for datasets in [\"addneuromed\", \"parkinsons\", \"covidaki\", \"motrpac\"]:\n", + " csv_file = f\"./stats/{datasets}/graph_stats.csv\"\n", + " df = pd.read_csv(csv_file)\n", + " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", + " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", + "\n", + " # Plot the evolution of graph statistics with respect to adj_thresh\n", + " plt.figure(figsize=(14, 10))\n", + "\n", + " # Plot number of edges\n", + " plt.subplot(3, 2, 1)\n", + " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Edges\")\n", + " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot average degree\n", + " plt.subplot(3, 2, 2)\n", + " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Average Degree\")\n", + " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot density\n", + " plt.subplot(3, 2, 3)\n", + " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Density\")\n", + " plt.title(\"Density vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot number of connected components\n", + " plt.subplot(3, 2, 4)\n", + " plt.plot(\n", + " df[\"adj_thresh\"],\n", + " df[\"number_connected_components\"],\n", + " label=\"Connected Components\",\n", + " color=\"purple\",\n", + " )\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Connected Components\")\n", + " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " plt.suptitle(datasets, fontsize=16, y=1.02)\n", + " # Adjust layout and show the plots\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d53f33f", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [\"wgcna\", \"spearman_correlation\", \"mutual_information\", \"distance_correlation\"]\n", + "\n", + "for metric in metrics:\n", + " # Load the CSV file into a pandas DataFrame\n", + " csv_file = \"./stats/\" + metric + \"/graph_stats.csv\"\n", + " df = pd.read_csv(csv_file)\n", + " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", + " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", + "\n", + " # Plot the evolution of graph statistics with respect to adj_thresh\n", + " plt.figure(figsize=(14, 10))\n", + " plt.suptitle(metric, fontsize=18)\n", + "\n", + " # Plot number of edges\n", + " plt.subplot(3, 2, 1)\n", + " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Edges\")\n", + " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot average degree\n", + " plt.subplot(3, 2, 2)\n", + " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Average Degree\")\n", + " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot density\n", + " plt.subplot(3, 2, 3)\n", + " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Density\")\n", + " plt.title(\"Density vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot number of connected components\n", + " plt.subplot(3, 2, 4)\n", + " plt.plot(\n", + " df[\"adj_thresh\"],\n", + " df[\"number_connected_components\"],\n", + " label=\"Connected Components\",\n", + " color=\"purple\",\n", + " )\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Connected Components\")\n", + " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Adjust layout and show the plots\n", + " plt.tight_layout()\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e6b900", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "proteo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "935b096d", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_name = \"addneuromed\"\n", - "\n", - "cfg = compose(\n", - " config_name=\"train.yaml\",\n", - " overrides=[\n", - " \"model=gat\",\n", - " f\"dataset={dataset_name}\",\n", - " \"dataset.loader.parameters.adjacency_threshold=0.5\",\n", - " \"dataset.loader.parameters.node_sample_ratio=full\",\n", - " ],\n", - " return_hydra_config=True,\n", - ")\n", - "loader = instantiate(cfg.dataset.loader)\n", - "dataset = loader.load_dataset()\n", - "print(dataset.processed_dir)\n", - "print(dataset[0])\n", - "\n", - "\n", - "def load_dataset(dataset_name, adj_thresh=0.5):\n", - " \"\"\"\n", - " Load the FTD dataset with a specified adjacency threshold.\n", - " \"\"\"\n", - " cfg = compose(\n", - " config_name=\"train.yaml\",\n", - " overrides=[\n", - " \"model=gat\",\n", - " f\"dataset={dataset_name}\",\n", - " f\"dataset.loader.parameters.adjacency_threshold={adj_thresh}\",\n", - " \"dataset.loader.parameters.node_sample_ratio=full\",\n", - " ],\n", - " return_hydra_config=True,\n", - " )\n", - " loader = instantiate(cfg.dataset.loader)\n", - " dataset = loader.load_dataset()\n", - " return dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31cb9b22", - "metadata": {}, - "outputs": [], - "source": [ - "root = \"/home/lcornelis/code/bgbench/run_data/omics/\"\n", - "name = osp.join(\n", - " root,\n", - " f\"{dataset.data_name}\",\n", - " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", - " f\"{dataset.method}\",\n", - " f\"p_{dataset.node_sample_ratio}\",\n", - " f\"train_split_{dataset.train_val_test_split[0]}\",\n", - " \"raw/adj_matrix.npy\",\n", - ")\n", - "print(name)\n", - "adj_matrix = np.load(name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7050e42f", - "metadata": {}, - "outputs": [], - "source": [ - "# Adjacency matrix loaded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8786a28d", - "metadata": {}, - "outputs": [], - "source": [ - "def get_graph_stats(dataset):\n", - " \"\"\"\n", - " Get statistics of the graph.\n", - " \"\"\"\n", - " # Load the adjacency matrix\n", - " root = \"/home/lcornelis/code/bgbench/run_data/omics/\"\n", - " name = osp.join(\n", - " root,\n", - " f\"{dataset.data_name}\",\n", - " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", - " f\"{dataset.method}\",\n", - " f\"p_{dataset.node_sample_ratio}\",\n", - " f\"train_split_{dataset.train_val_test_split[0]}\",\n", - " \"raw/adj_matrix.npy\",\n", - " )\n", - " adj_matrix = np.load(name)\n", - "\n", - " # Generate a graph from the adjacency matrix\n", - " graph = nx.from_numpy_array(adj_matrix)\n", - " graph.remove_edges_from(nx.selfloop_edges(graph))\n", - "\n", - " # Calculate statistics\n", - " num_nodes = graph.number_of_nodes()\n", - " num_edges = graph.number_of_edges()\n", - " avg_degree = np.mean([d for _, d in graph.degree()])\n", - " density = nx.density(graph)\n", - " number_connected_components = nx.number_connected_components(graph)\n", - "\n", - " return {\n", - " \"num_nodes\": num_nodes,\n", - " \"num_edges\": num_edges,\n", - " \"avg_degree\": avg_degree,\n", - " \"density\": density,\n", - " \"number_connected_components\": number_connected_components,\n", - " }\n", - "\n", - "\n", - "# Get graph statistics\n", - "stats = get_graph_stats(dataset)\n", - "print(\"\\nGraph statistics:\\n\")\n", - "for key, value in stats.items():\n", - " print(f\"\\t{key}: {value}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "949f821e", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize a list to store all stats\n", - "all_stats = []\n", - "\n", - "for adj_thresh in np.arange(0, 1.01, 0.01):\n", - " # Load the dataset\n", - " dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", - " # Get graph statistics\n", - " stats = get_graph_stats(dataset)\n", - " # Add the adjacency threshold to the stats\n", - " stats[\"adj_thresh\"] = adj_thresh\n", - " # Append the stats to the list\n", - " all_stats.append(stats)\n", - "\n", - "# Save all stats to a CSV file\n", - "output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", - "os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", - "with open(output_file, \"w\", newline=\"\") as f:\n", - " writer = csv.DictWriter(\n", - " f,\n", - " fieldnames=[\n", - " \"adj_thresh\",\n", - " \"num_nodes\",\n", - " \"num_edges\",\n", - " \"avg_degree\",\n", - " \"density\",\n", - " \"number_connected_components\",\n", - " ],\n", - " )\n", - " writer.writeheader() # Write the header row\n", - " writer.writerows(all_stats) # Write all rows\n", - "\n", - "print(f\"Graph statistics saved to {output_file}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a93c9c0", - "metadata": {}, - "outputs": [], - "source": [ - "# import os, csv, math\n", - "# import numpy as np\n", - "# from concurrent.futures import ProcessPoolExecutor, as_completed\n", - "# from functools import partial\n", - "# from time import perf_counter\n", - "\n", - "# # ---------- helper: single-task worker ----------\n", - "# def _compute_stats_for_thresh(adj_thresh, dataset_name):\n", - "# \"\"\"\n", - "# Runs in a separate process. Must be top-level so it can be pickled.\n", - "# \"\"\"\n", - "# # Load the dataset for this threshold\n", - "# dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", - "# # Compute graph stats\n", - "# stats = get_graph_stats(dataset)\n", - "# # Attach the threshold\n", - "# stats[\"adj_thresh\"] = float(adj_thresh)\n", - "# return stats\n", - "\n", - "# # ---------- main parallel block ----------\n", - "# start = perf_counter()\n", - "\n", - "# # Make a stable list of thresholds (avoids float step accumulation issues)\n", - "# adj_thresholds = [round(x, 2) for x in np.linspace(0.0, 1.0, 101)]\n", - "\n", - "# # How many worker processes to use. Tweak if you want fewer.\n", - "# max_workers = os.cpu_count() or 2\n", - "\n", - "# # Run in parallel\n", - "# results_in_order = [None] * len(adj_thresholds)\n", - "# with ProcessPoolExecutor(max_workers=max_workers) as ex:\n", - "# # Submit all jobs\n", - "# futures = {\n", - "# ex.submit(_compute_stats_for_thresh, t, dataset_name): idx\n", - "# for idx, t in enumerate(adj_thresholds)\n", - "# }\n", - "# # Collect as they finish, but store back in original order\n", - "# for fut in as_completed(futures):\n", - "# idx = futures[fut]\n", - "# try:\n", - "# results_in_order[idx] = fut.result()\n", - "# except Exception as e:\n", - "# # You can choose to raise here, or record an error row.\n", - "# # For now, record a minimal row with the error noted.\n", - "# results_in_order[idx] = {\n", - "# \"adj_thresh\": adj_thresholds[idx],\n", - "# \"num_nodes\": None,\n", - "# \"num_edges\": None,\n", - "# \"avg_degree\": None,\n", - "# \"density\": None,\n", - "# \"number_connected_components\": None,\n", - "# \"error\": str(e),\n", - "# }\n", - "\n", - "# # Save to CSV (same columns you used; \"error\" is optional)\n", - "# output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", - "# os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", - "\n", - "# fieldnames = [\n", - "# \"adj_thresh\",\n", - "# \"num_nodes\",\n", - "# \"num_edges\",\n", - "# \"avg_degree\",\n", - "# \"density\",\n", - "# \"number_connected_components\",\n", - "# ]\n", - "# # If any row had an error, include that column so you can see what failed.\n", - "# if any((\"error\" in r) for r in results_in_order):\n", - "# fieldnames.append(\"error\")\n", - "\n", - "# with open(output_file, \"w\", newline=\"\") as f:\n", - "# writer = csv.DictWriter(f, fieldnames=fieldnames)\n", - "# writer.writeheader()\n", - "# writer.writerows(results_in_order)\n", - "\n", - "# print(f\"Graph statistics saved to {output_file}\")\n", - "# print(f\"Completed in {perf_counter() - start:.2f}s using {max_workers} workers.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07e6c5df", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(100, -1, -1):\n", - " print(i / 100)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "729e7c59", - "metadata": {}, - "outputs": [], - "source": [ - "for datasets in [\"addneuromed\", \"parkinsons\", \"covidaki\", \"motrpac\"]:\n", - " csv_file = f\"./stats/{datasets}/graph_stats.csv\"\n", - " df = pd.read_csv(csv_file)\n", - " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", - " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", - "\n", - " # Plot the evolution of graph statistics with respect to adj_thresh\n", - " plt.figure(figsize=(14, 10))\n", - "\n", - " # Plot number of edges\n", - " plt.subplot(3, 2, 1)\n", - " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Edges\")\n", - " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot average degree\n", - " plt.subplot(3, 2, 2)\n", - " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Average Degree\")\n", - " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot density\n", - " plt.subplot(3, 2, 3)\n", - " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Density\")\n", - " plt.title(\"Density vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot number of connected components\n", - " plt.subplot(3, 2, 4)\n", - " plt.plot(\n", - " df[\"adj_thresh\"],\n", - " df[\"number_connected_components\"],\n", - " label=\"Connected Components\",\n", - " color=\"purple\",\n", - " )\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Connected Components\")\n", - " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " plt.suptitle(datasets, fontsize=16, y=1.02)\n", - " # Adjust layout and show the plots\n", - " plt.tight_layout()\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d53f33f", - "metadata": {}, - "outputs": [], - "source": [ - "metrics = [\"wgcna\", \"spearman_correlation\", \"mutual_information\", \"distance_correlation\"]\n", - "\n", - "for metric in metrics:\n", - " # Load the CSV file into a pandas DataFrame\n", - " csv_file = \"./stats/\" + metric + \"/graph_stats.csv\"\n", - " df = pd.read_csv(csv_file)\n", - " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", - " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", - "\n", - " # Plot the evolution of graph statistics with respect to adj_thresh\n", - " plt.figure(figsize=(14, 10))\n", - " plt.suptitle(metric, fontsize=18)\n", - "\n", - " # Plot number of edges\n", - " plt.subplot(3, 2, 1)\n", - " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Edges\")\n", - " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot average degree\n", - " plt.subplot(3, 2, 2)\n", - " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Average Degree\")\n", - " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot density\n", - " plt.subplot(3, 2, 3)\n", - " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Density\")\n", - " plt.title(\"Density vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot number of connected components\n", - " plt.subplot(3, 2, 4)\n", - " plt.plot(\n", - " df[\"adj_thresh\"],\n", - " df[\"number_connected_components\"],\n", - " label=\"Connected Components\",\n", - " color=\"purple\",\n", - " )\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Connected Components\")\n", - " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Adjust layout and show the plots\n", - " plt.tight_layout()\n", - " plt.show()\n", - " plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12e6b900", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "proteo", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/dataset_stats_analysis.py b/tutorials/dataset_stats_analysis.py index bc8243a1..5ecdf2b9 100644 --- a/tutorials/dataset_stats_analysis.py +++ b/tutorials/dataset_stats_analysis.py @@ -39,7 +39,7 @@ def load_dataset( if node_sample_ratio == 'full': dataset = HFOmicsDataset( - root='/home/johmathe/bgbench/run_data/omics', + root='./run_data/omics', data_name=dataset_name, method=method, adjacency_threshold=adj_thresh, @@ -49,7 +49,7 @@ def load_dataset( ) else: dataset = HFOmicsDataset( - root='/home/johmathe/bgbench/run_data/omics', + root='./run_data/omics', data_name=dataset_name, method=method, adjacency_threshold=adj_thresh, @@ -85,7 +85,7 @@ def get_graph_stats(dataset: Any) -> dict[str, float]: graph.add_nodes_from(range(num_nodes)) graph.add_edges_from(edge_list) else: - root = '/home/johmathe/bgbench/run_data/omics/' + root = './run_data/omics/' name = osp.join( root, f'{dataset.data_name}', diff --git a/webapp/README.md b/webapp/README.md index 9859ead4..4ccf4545 100644 --- a/webapp/README.md +++ b/webapp/README.md @@ -102,7 +102,7 @@ python precompute_stats.py ``` This will: -1. Download datasets from HuggingFace (`geometric-intelligence/bgbench`) +1. Download datasets from HuggingFace (`/bgbench`) 2. Compute graph statistics for all 324 combinations: - 3 datasets × 6 ratios × 3 methods × 6 thresholds 3. Save results to `public/data/stats.json` diff --git a/webapp/precompute_stats.py b/webapp/precompute_stats.py index 7fc01caa..346fe1f1 100644 --- a/webapp/precompute_stats.py +++ b/webapp/precompute_stats.py @@ -41,7 +41,7 @@ }, } -HF_REPO_ID = 'geometric-intelligence/bgbench' +HF_REPO_ID = '/bgbench' # Parameter grids NODE_SAMPLE_RATIOS = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0] From 0d1c65824732f336a32b51883a1bfb121131572a Mon Sep 17 00:00:00 2001 From: Johan Mathe Date: Thu, 29 Jan 2026 05:49:10 +0000 Subject: [PATCH 2/2] Anonimize --- notebooks/datasets.ipynb | 268 +++++----- ogbench/baseline.py | 3 +- scripts/export_wandb.py | 1 + tutorials/dataset_stats.ipynb | 904 +++++++++++++++++----------------- webapp/README.md | 16 +- webapp/precompute_stats.py | 105 ++-- 6 files changed, 650 insertions(+), 647 deletions(-) diff --git a/notebooks/datasets.ipynb b/notebooks/datasets.ipynb index 6bf377cd..1aba645f 100644 --- a/notebooks/datasets.ipynb +++ b/notebooks/datasets.ipynb @@ -1,137 +1,137 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "\n", - "sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(\".\"))))\n", - "os.environ[\"PYTHONPATH\"] = os.pathsep.join(sys.path)\n", - "from src.data import hf_datamodule\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data inspection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set style\n", - "sns.set_palette(\"husl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set style\n", - "sns.set_palette(\"husl\")\n", - "\n", - "\n", - "def plot_dataset_distributions(datamodule_class, title):\n", - " \"\"\"Plot feature and target distributions for a dataset.\"\"\"\n", - " # Initialize datamodule\n", - " datamodule = datamodule_class()\n", - " datamodule.prepare_data()\n", - "\n", - " # Load data\n", - " selected_data = pd.read_parquet(datamodule.selected_data_path)\n", - " targets = np.load(datamodule.targets_path)\n", - "\n", - " # Create figure\n", - " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n", - " fig.suptitle(f\"{title} Dataset Distributions\", fontsize=16)\n", - "\n", - " # Plot feature distributions\n", - " sns.histplot(data=selected_data.values.flatten(), ax=ax1, bins=1000)\n", - " ax1.set_title(\"Feature Value Distribution\")\n", - " ax1.set_xlabel(\"Feature Value\")\n", - " ax1.set_ylabel(\"Count\")\n", - "\n", - " # Plot target distribution\n", - " sns.histplot(data=targets, ax=ax2, bins=50)\n", - " ax2.set_title(\"Target Value Distribution\")\n", - " ax2.set_xlabel(\"Target Value\")\n", - " ax2.set_ylabel(\"Count\")\n", - "\n", - " plt.tight_layout()\n", - " plt.show()\n", - "\n", - " # Print statistics\n", - " print(f\"\\n{title} Dataset Statistics:\")\n", - " print(f\"Number of samples: {len(targets)}\")\n", - " print(f\"Number of features: {selected_data.shape[1]}\")\n", - " print(\"\\nTarget Statistics:\")\n", - " print(f\"Mean: {np.mean(targets):.2f}\")\n", - " print(f\"Std: {np.std(targets):.2f}\")\n", - " print(f\"Min: {np.min(targets):.2f}\")\n", - " print(f\"Max: {np.max(targets):.2f}\")\n", - "\n", - "\n", - "# Plot distributions for each dataset\n", - "data = hf_datamodule.HFMotrPacDataModule()\n", - "data = hf_datamodule.HFAddNeuroMedOmicsDataModule()\n", - "data = hf_datamodule.HFCovidAKIOmicsDataModule()\n", - "data = hf_datamodule.HFParkinsonsOmicsDataModule()\n", - "\n", - "data.prepare_data()\n", - "\n", - "# plot_dataset_distributions(data, data.dataset_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bgbench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(\".\"))))\n", + "os.environ[\"PYTHONPATH\"] = os.pathsep.join(sys.path)\n", + "from src.data import hf_datamodule\n" + ] }, - "nbformat": 4, - "nbformat_minor": 2 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data inspection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set style\n", + "sns.set_palette(\"husl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set style\n", + "sns.set_palette(\"husl\")\n", + "\n", + "\n", + "def plot_dataset_distributions(datamodule_class, title):\n", + " \"\"\"Plot feature and target distributions for a dataset.\"\"\"\n", + " # Initialize datamodule\n", + " datamodule = datamodule_class()\n", + " datamodule.prepare_data()\n", + "\n", + " # Load data\n", + " selected_data = pd.read_parquet(datamodule.selected_data_path)\n", + " targets = np.load(datamodule.targets_path)\n", + "\n", + " # Create figure\n", + " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n", + " fig.suptitle(f\"{title} Dataset Distributions\", fontsize=16)\n", + "\n", + " # Plot feature distributions\n", + " sns.histplot(data=selected_data.values.flatten(), ax=ax1, bins=1000)\n", + " ax1.set_title(\"Feature Value Distribution\")\n", + " ax1.set_xlabel(\"Feature Value\")\n", + " ax1.set_ylabel(\"Count\")\n", + "\n", + " # Plot target distribution\n", + " sns.histplot(data=targets, ax=ax2, bins=50)\n", + " ax2.set_title(\"Target Value Distribution\")\n", + " ax2.set_xlabel(\"Target Value\")\n", + " ax2.set_ylabel(\"Count\")\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + " # Print statistics\n", + " print(f\"\\n{title} Dataset Statistics:\")\n", + " print(f\"Number of samples: {len(targets)}\")\n", + " print(f\"Number of features: {selected_data.shape[1]}\")\n", + " print(\"\\nTarget Statistics:\")\n", + " print(f\"Mean: {np.mean(targets):.2f}\")\n", + " print(f\"Std: {np.std(targets):.2f}\")\n", + " print(f\"Min: {np.min(targets):.2f}\")\n", + " print(f\"Max: {np.max(targets):.2f}\")\n", + "\n", + "\n", + "# Plot distributions for each dataset\n", + "data = hf_datamodule.HFMotrPacDataModule()\n", + "data = hf_datamodule.HFAddNeuroMedOmicsDataModule()\n", + "data = hf_datamodule.HFCovidAKIOmicsDataModule()\n", + "data = hf_datamodule.HFParkinsonsOmicsDataModule()\n", + "\n", + "data.prepare_data()\n", + "\n", + "# plot_dataset_distributions(data, data.dataset_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bgbench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/ogbench/baseline.py b/ogbench/baseline.py index 1351c6d2..c3d246a5 100644 --- a/ogbench/baseline.py +++ b/ogbench/baseline.py @@ -14,7 +14,6 @@ import pandas as pd import rootutils import seaborn as sns -import wandb from huggingface_hub import hf_hub_download from omegaconf import DictConfig, OmegaConf from sklearn.decomposition import PCA @@ -35,6 +34,8 @@ from sklearn.pipeline import Pipeline from sklearn.utils import shuffle +import wandb + rootutils.setup_root(__file__, indicator='.project-root', pythonpath=True) # Set matplotlib style diff --git a/scripts/export_wandb.py b/scripts/export_wandb.py index d657b404..59aff415 100644 --- a/scripts/export_wandb.py +++ b/scripts/export_wandb.py @@ -2,6 +2,7 @@ from typing import Any import pandas as pd + import wandb diff --git a/tutorials/dataset_stats.ipynb b/tutorials/dataset_stats.ipynb index 4f352562..91753512 100644 --- a/tutorials/dataset_stats.ipynb +++ b/tutorials/dataset_stats.ipynb @@ -1,455 +1,455 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6ce976e7", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "from hydra import compose, initialize\n", - "from hydra.core.global_hydra import GlobalHydra # Import GlobalHydra explicitly\n", - "from hydra.utils import instantiate\n", - "\n", - "from ogbench.utils.config_resolvers import (\n", - " get_default_transform,\n", - " get_monitor_metric,\n", - " get_monitor_mode,\n", - " infer_in_channels,\n", - ")\n", - "\n", - "# Clear GlobalHydra instance if already initialized\n", - "if GlobalHydra().is_initialized():\n", - " GlobalHydra().clear()\n", - "\n", - "initialize(config_path=\"../configs\", job_name=\"job\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "935b096d", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_name = \"addneuromed\"\n", - "\n", - "cfg = compose(\n", - " config_name=\"train.yaml\",\n", - " overrides=[\n", - " \"model=gat\",\n", - " f\"dataset={dataset_name}\",\n", - " \"dataset.loader.parameters.adjacency_threshold=0.5\",\n", - " \"dataset.loader.parameters.node_sample_ratio=full\",\n", - " ],\n", - " return_hydra_config=True,\n", - ")\n", - "loader = instantiate(cfg.dataset.loader)\n", - "dataset = loader.load_dataset()\n", - "print(dataset.processed_dir)\n", - "print(dataset[0])\n", - "\n", - "\n", - "def load_dataset(dataset_name, adj_thresh=0.5):\n", - " \"\"\"\n", - " Load the FTD dataset with a specified adjacency threshold.\n", - " \"\"\"\n", - " cfg = compose(\n", - " config_name=\"train.yaml\",\n", - " overrides=[\n", - " \"model=gat\",\n", - " f\"dataset={dataset_name}\",\n", - " f\"dataset.loader.parameters.adjacency_threshold={adj_thresh}\",\n", - " \"dataset.loader.parameters.node_sample_ratio=full\",\n", - " ],\n", - " return_hydra_config=True,\n", - " )\n", - " loader = instantiate(cfg.dataset.loader)\n", - " dataset = loader.load_dataset()\n", - " return dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31cb9b22", - "metadata": {}, - "outputs": [], - "source": [ - "root = \"./run_data/omics/\"\n", - "name = osp.join(\n", - " root,\n", - " f\"{dataset.data_name}\",\n", - " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", - " f\"{dataset.method}\",\n", - " f\"p_{dataset.node_sample_ratio}\",\n", - " f\"train_split_{dataset.train_val_test_split[0]}\",\n", - " \"raw/adj_matrix.npy\",\n", - ")\n", - "print(name)\n", - "adj_matrix = np.load(name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7050e42f", - "metadata": {}, - "outputs": [], - "source": [ - "# Adjacency matrix loaded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8786a28d", - "metadata": {}, - "outputs": [], - "source": [ - "def get_graph_stats(dataset):\n", - " \"\"\"\n", - " Get statistics of the graph.\n", - " \"\"\"\n", - " # Load the adjacency matrix\n", - " root = \"./run_data/omics/\"\n", - " name = osp.join(\n", - " root,\n", - " f\"{dataset.data_name}\",\n", - " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", - " f\"{dataset.method}\",\n", - " f\"p_{dataset.node_sample_ratio}\",\n", - " f\"train_split_{dataset.train_val_test_split[0]}\",\n", - " \"raw/adj_matrix.npy\",\n", - " )\n", - " adj_matrix = np.load(name)\n", - "\n", - " # Generate a graph from the adjacency matrix\n", - " graph = nx.from_numpy_array(adj_matrix)\n", - " graph.remove_edges_from(nx.selfloop_edges(graph))\n", - "\n", - " # Calculate statistics\n", - " num_nodes = graph.number_of_nodes()\n", - " num_edges = graph.number_of_edges()\n", - " avg_degree = np.mean([d for _, d in graph.degree()])\n", - " density = nx.density(graph)\n", - " number_connected_components = nx.number_connected_components(graph)\n", - "\n", - " return {\n", - " \"num_nodes\": num_nodes,\n", - " \"num_edges\": num_edges,\n", - " \"avg_degree\": avg_degree,\n", - " \"density\": density,\n", - " \"number_connected_components\": number_connected_components,\n", - " }\n", - "\n", - "\n", - "# Get graph statistics\n", - "stats = get_graph_stats(dataset)\n", - "print(\"\\nGraph statistics:\\n\")\n", - "for key, value in stats.items():\n", - " print(f\"\\t{key}: {value}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "949f821e", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize a list to store all stats\n", - "all_stats = []\n", - "\n", - "for adj_thresh in np.arange(0, 1.01, 0.01):\n", - " # Load the dataset\n", - " dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", - " # Get graph statistics\n", - " stats = get_graph_stats(dataset)\n", - " # Add the adjacency threshold to the stats\n", - " stats[\"adj_thresh\"] = adj_thresh\n", - " # Append the stats to the list\n", - " all_stats.append(stats)\n", - "\n", - "# Save all stats to a CSV file\n", - "output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", - "os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", - "with open(output_file, \"w\", newline=\"\") as f:\n", - " writer = csv.DictWriter(\n", - " f,\n", - " fieldnames=[\n", - " \"adj_thresh\",\n", - " \"num_nodes\",\n", - " \"num_edges\",\n", - " \"avg_degree\",\n", - " \"density\",\n", - " \"number_connected_components\",\n", - " ],\n", - " )\n", - " writer.writeheader() # Write the header row\n", - " writer.writerows(all_stats) # Write all rows\n", - "\n", - "print(f\"Graph statistics saved to {output_file}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a93c9c0", - "metadata": {}, - "outputs": [], - "source": [ - "# import os, csv, math\n", - "# import numpy as np\n", - "# from concurrent.futures import ProcessPoolExecutor, as_completed\n", - "# from functools import partial\n", - "# from time import perf_counter\n", - "\n", - "# # ---------- helper: single-task worker ----------\n", - "# def _compute_stats_for_thresh(adj_thresh, dataset_name):\n", - "# \"\"\"\n", - "# Runs in a separate process. Must be top-level so it can be pickled.\n", - "# \"\"\"\n", - "# # Load the dataset for this threshold\n", - "# dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", - "# # Compute graph stats\n", - "# stats = get_graph_stats(dataset)\n", - "# # Attach the threshold\n", - "# stats[\"adj_thresh\"] = float(adj_thresh)\n", - "# return stats\n", - "\n", - "# # ---------- main parallel block ----------\n", - "# start = perf_counter()\n", - "\n", - "# # Make a stable list of thresholds (avoids float step accumulation issues)\n", - "# adj_thresholds = [round(x, 2) for x in np.linspace(0.0, 1.0, 101)]\n", - "\n", - "# # How many worker processes to use. Tweak if you want fewer.\n", - "# max_workers = os.cpu_count() or 2\n", - "\n", - "# # Run in parallel\n", - "# results_in_order = [None] * len(adj_thresholds)\n", - "# with ProcessPoolExecutor(max_workers=max_workers) as ex:\n", - "# # Submit all jobs\n", - "# futures = {\n", - "# ex.submit(_compute_stats_for_thresh, t, dataset_name): idx\n", - "# for idx, t in enumerate(adj_thresholds)\n", - "# }\n", - "# # Collect as they finish, but store back in original order\n", - "# for fut in as_completed(futures):\n", - "# idx = futures[fut]\n", - "# try:\n", - "# results_in_order[idx] = fut.result()\n", - "# except Exception as e:\n", - "# # You can choose to raise here, or record an error row.\n", - "# # For now, record a minimal row with the error noted.\n", - "# results_in_order[idx] = {\n", - "# \"adj_thresh\": adj_thresholds[idx],\n", - "# \"num_nodes\": None,\n", - "# \"num_edges\": None,\n", - "# \"avg_degree\": None,\n", - "# \"density\": None,\n", - "# \"number_connected_components\": None,\n", - "# \"error\": str(e),\n", - "# }\n", - "\n", - "# # Save to CSV (same columns you used; \"error\" is optional)\n", - "# output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", - "# os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", - "\n", - "# fieldnames = [\n", - "# \"adj_thresh\",\n", - "# \"num_nodes\",\n", - "# \"num_edges\",\n", - "# \"avg_degree\",\n", - "# \"density\",\n", - "# \"number_connected_components\",\n", - "# ]\n", - "# # If any row had an error, include that column so you can see what failed.\n", - "# if any((\"error\" in r) for r in results_in_order):\n", - "# fieldnames.append(\"error\")\n", - "\n", - "# with open(output_file, \"w\", newline=\"\") as f:\n", - "# writer = csv.DictWriter(f, fieldnames=fieldnames)\n", - "# writer.writeheader()\n", - "# writer.writerows(results_in_order)\n", - "\n", - "# print(f\"Graph statistics saved to {output_file}\")\n", - "# print(f\"Completed in {perf_counter() - start:.2f}s using {max_workers} workers.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07e6c5df", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(100, -1, -1):\n", - " print(i / 100)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "729e7c59", - "metadata": {}, - "outputs": [], - "source": [ - "for datasets in [\"addneuromed\", \"parkinsons\", \"covidaki\", \"motrpac\"]:\n", - " csv_file = f\"./stats/{datasets}/graph_stats.csv\"\n", - " df = pd.read_csv(csv_file)\n", - " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", - " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", - "\n", - " # Plot the evolution of graph statistics with respect to adj_thresh\n", - " plt.figure(figsize=(14, 10))\n", - "\n", - " # Plot number of edges\n", - " plt.subplot(3, 2, 1)\n", - " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Edges\")\n", - " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot average degree\n", - " plt.subplot(3, 2, 2)\n", - " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Average Degree\")\n", - " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot density\n", - " plt.subplot(3, 2, 3)\n", - " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Density\")\n", - " plt.title(\"Density vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " # Plot number of connected components\n", - " plt.subplot(3, 2, 4)\n", - " plt.plot(\n", - " df[\"adj_thresh\"],\n", - " df[\"number_connected_components\"],\n", - " label=\"Connected Components\",\n", - " color=\"purple\",\n", - " )\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Connected Components\")\n", - " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", - " plt.grid(True)\n", - "\n", - " plt.suptitle(datasets, fontsize=16, y=1.02)\n", - " # Adjust layout and show the plots\n", - " plt.tight_layout()\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d53f33f", - "metadata": {}, - "outputs": [], - "source": [ - "metrics = [\"wgcna\", \"spearman_correlation\", \"mutual_information\", \"distance_correlation\"]\n", - "\n", - "for metric in metrics:\n", - " # Load the CSV file into a pandas DataFrame\n", - " csv_file = \"./stats/\" + metric + \"/graph_stats.csv\"\n", - " df = pd.read_csv(csv_file)\n", - " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", - " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", - "\n", - " # Plot the evolution of graph statistics with respect to adj_thresh\n", - " plt.figure(figsize=(14, 10))\n", - " plt.suptitle(metric, fontsize=18)\n", - "\n", - " # Plot number of edges\n", - " plt.subplot(3, 2, 1)\n", - " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Edges\")\n", - " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot average degree\n", - " plt.subplot(3, 2, 2)\n", - " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Average Degree\")\n", - " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot density\n", - " plt.subplot(3, 2, 3)\n", - " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Density\")\n", - " plt.title(\"Density vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Plot number of connected components\n", - " plt.subplot(3, 2, 4)\n", - " plt.plot(\n", - " df[\"adj_thresh\"],\n", - " df[\"number_connected_components\"],\n", - " label=\"Connected Components\",\n", - " color=\"purple\",\n", - " )\n", - " plt.xlabel(\"Adjacency Threshold\")\n", - " plt.ylabel(\"Number of Connected Components\")\n", - " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", - " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", - " plt.grid(True)\n", - "\n", - " # Adjust layout and show the plots\n", - " plt.tight_layout()\n", - " plt.show()\n", - " plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12e6b900", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "proteo", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "6ce976e7", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from hydra import compose, initialize\n", + "from hydra.core.global_hydra import GlobalHydra # Import GlobalHydra explicitly\n", + "from hydra.utils import instantiate\n", + "\n", + "from ogbench.utils.config_resolvers import (\n", + " get_default_transform,\n", + " get_monitor_metric,\n", + " get_monitor_mode,\n", + " infer_in_channels,\n", + ")\n", + "\n", + "# Clear GlobalHydra instance if already initialized\n", + "if GlobalHydra().is_initialized():\n", + " GlobalHydra().clear()\n", + "\n", + "initialize(config_path=\"../configs\", job_name=\"job\")" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "code", + "execution_count": null, + "id": "935b096d", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_name = \"addneuromed\"\n", + "\n", + "cfg = compose(\n", + " config_name=\"train.yaml\",\n", + " overrides=[\n", + " \"model=gat\",\n", + " f\"dataset={dataset_name}\",\n", + " \"dataset.loader.parameters.adjacency_threshold=0.5\",\n", + " \"dataset.loader.parameters.node_sample_ratio=full\",\n", + " ],\n", + " return_hydra_config=True,\n", + ")\n", + "loader = instantiate(cfg.dataset.loader)\n", + "dataset = loader.load_dataset()\n", + "print(dataset.processed_dir)\n", + "print(dataset[0])\n", + "\n", + "\n", + "def load_dataset(dataset_name, adj_thresh=0.5):\n", + " \"\"\"\n", + " Load the FTD dataset with a specified adjacency threshold.\n", + " \"\"\"\n", + " cfg = compose(\n", + " config_name=\"train.yaml\",\n", + " overrides=[\n", + " \"model=gat\",\n", + " f\"dataset={dataset_name}\",\n", + " f\"dataset.loader.parameters.adjacency_threshold={adj_thresh}\",\n", + " \"dataset.loader.parameters.node_sample_ratio=full\",\n", + " ],\n", + " return_hydra_config=True,\n", + " )\n", + " loader = instantiate(cfg.dataset.loader)\n", + " dataset = loader.load_dataset()\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31cb9b22", + "metadata": {}, + "outputs": [], + "source": [ + "root = \"./run_data/omics/\"\n", + "name = osp.join(\n", + " root,\n", + " f\"{dataset.data_name}\",\n", + " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", + " f\"{dataset.method}\",\n", + " f\"p_{dataset.node_sample_ratio}\",\n", + " f\"train_split_{dataset.train_val_test_split[0]}\",\n", + " \"raw/adj_matrix.npy\",\n", + ")\n", + "print(name)\n", + "adj_matrix = np.load(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7050e42f", + "metadata": {}, + "outputs": [], + "source": [ + "# Adjacency matrix loaded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8786a28d", + "metadata": {}, + "outputs": [], + "source": [ + "def get_graph_stats(dataset):\n", + " \"\"\"\n", + " Get statistics of the graph.\n", + " \"\"\"\n", + " # Load the adjacency matrix\n", + " root = \"./run_data/omics/\"\n", + " name = osp.join(\n", + " root,\n", + " f\"{dataset.data_name}\",\n", + " f\"adj_thresh_{dataset.adjacency_threshold}\",\n", + " f\"{dataset.method}\",\n", + " f\"p_{dataset.node_sample_ratio}\",\n", + " f\"train_split_{dataset.train_val_test_split[0]}\",\n", + " \"raw/adj_matrix.npy\",\n", + " )\n", + " adj_matrix = np.load(name)\n", + "\n", + " # Generate a graph from the adjacency matrix\n", + " graph = nx.from_numpy_array(adj_matrix)\n", + " graph.remove_edges_from(nx.selfloop_edges(graph))\n", + "\n", + " # Calculate statistics\n", + " num_nodes = graph.number_of_nodes()\n", + " num_edges = graph.number_of_edges()\n", + " avg_degree = np.mean([d for _, d in graph.degree()])\n", + " density = nx.density(graph)\n", + " number_connected_components = nx.number_connected_components(graph)\n", + "\n", + " return {\n", + " \"num_nodes\": num_nodes,\n", + " \"num_edges\": num_edges,\n", + " \"avg_degree\": avg_degree,\n", + " \"density\": density,\n", + " \"number_connected_components\": number_connected_components,\n", + " }\n", + "\n", + "\n", + "# Get graph statistics\n", + "stats = get_graph_stats(dataset)\n", + "print(\"\\nGraph statistics:\\n\")\n", + "for key, value in stats.items():\n", + " print(f\"\\t{key}: {value}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "949f821e", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize a list to store all stats\n", + "all_stats = []\n", + "\n", + "for adj_thresh in np.arange(0, 1.01, 0.01):\n", + " # Load the dataset\n", + " dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", + " # Get graph statistics\n", + " stats = get_graph_stats(dataset)\n", + " # Add the adjacency threshold to the stats\n", + " stats[\"adj_thresh\"] = adj_thresh\n", + " # Append the stats to the list\n", + " all_stats.append(stats)\n", + "\n", + "# Save all stats to a CSV file\n", + "output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", + "os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", + "with open(output_file, \"w\", newline=\"\") as f:\n", + " writer = csv.DictWriter(\n", + " f,\n", + " fieldnames=[\n", + " \"adj_thresh\",\n", + " \"num_nodes\",\n", + " \"num_edges\",\n", + " \"avg_degree\",\n", + " \"density\",\n", + " \"number_connected_components\",\n", + " ],\n", + " )\n", + " writer.writeheader() # Write the header row\n", + " writer.writerows(all_stats) # Write all rows\n", + "\n", + "print(f\"Graph statistics saved to {output_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a93c9c0", + "metadata": {}, + "outputs": [], + "source": [ + "# import os, csv, math\n", + "# import numpy as np\n", + "# from concurrent.futures import ProcessPoolExecutor, as_completed\n", + "# from functools import partial\n", + "# from time import perf_counter\n", + "\n", + "# # ---------- helper: single-task worker ----------\n", + "# def _compute_stats_for_thresh(adj_thresh, dataset_name):\n", + "# \"\"\"\n", + "# Runs in a separate process. Must be top-level so it can be pickled.\n", + "# \"\"\"\n", + "# # Load the dataset for this threshold\n", + "# dataset = load_dataset(dataset_name, adj_thresh=adj_thresh)\n", + "# # Compute graph stats\n", + "# stats = get_graph_stats(dataset)\n", + "# # Attach the threshold\n", + "# stats[\"adj_thresh\"] = float(adj_thresh)\n", + "# return stats\n", + "\n", + "# # ---------- main parallel block ----------\n", + "# start = perf_counter()\n", + "\n", + "# # Make a stable list of thresholds (avoids float step accumulation issues)\n", + "# adj_thresholds = [round(x, 2) for x in np.linspace(0.0, 1.0, 101)]\n", + "\n", + "# # How many worker processes to use. Tweak if you want fewer.\n", + "# max_workers = os.cpu_count() or 2\n", + "\n", + "# # Run in parallel\n", + "# results_in_order = [None] * len(adj_thresholds)\n", + "# with ProcessPoolExecutor(max_workers=max_workers) as ex:\n", + "# # Submit all jobs\n", + "# futures = {\n", + "# ex.submit(_compute_stats_for_thresh, t, dataset_name): idx\n", + "# for idx, t in enumerate(adj_thresholds)\n", + "# }\n", + "# # Collect as they finish, but store back in original order\n", + "# for fut in as_completed(futures):\n", + "# idx = futures[fut]\n", + "# try:\n", + "# results_in_order[idx] = fut.result()\n", + "# except Exception as e:\n", + "# # You can choose to raise here, or record an error row.\n", + "# # For now, record a minimal row with the error noted.\n", + "# results_in_order[idx] = {\n", + "# \"adj_thresh\": adj_thresholds[idx],\n", + "# \"num_nodes\": None,\n", + "# \"num_edges\": None,\n", + "# \"avg_degree\": None,\n", + "# \"density\": None,\n", + "# \"number_connected_components\": None,\n", + "# \"error\": str(e),\n", + "# }\n", + "\n", + "# # Save to CSV (same columns you used; \"error\" is optional)\n", + "# output_file = f\"./stats/{dataset_name}/graph_stats.csv\"\n", + "# os.makedirs(os.path.dirname(output_file), exist_ok=True)\n", + "\n", + "# fieldnames = [\n", + "# \"adj_thresh\",\n", + "# \"num_nodes\",\n", + "# \"num_edges\",\n", + "# \"avg_degree\",\n", + "# \"density\",\n", + "# \"number_connected_components\",\n", + "# ]\n", + "# # If any row had an error, include that column so you can see what failed.\n", + "# if any((\"error\" in r) for r in results_in_order):\n", + "# fieldnames.append(\"error\")\n", + "\n", + "# with open(output_file, \"w\", newline=\"\") as f:\n", + "# writer = csv.DictWriter(f, fieldnames=fieldnames)\n", + "# writer.writeheader()\n", + "# writer.writerows(results_in_order)\n", + "\n", + "# print(f\"Graph statistics saved to {output_file}\")\n", + "# print(f\"Completed in {perf_counter() - start:.2f}s using {max_workers} workers.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07e6c5df", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(100, -1, -1):\n", + " print(i / 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "729e7c59", + "metadata": {}, + "outputs": [], + "source": [ + "for datasets in [\"addneuromed\", \"parkinsons\", \"covidaki\", \"motrpac\"]:\n", + " csv_file = f\"./stats/{datasets}/graph_stats.csv\"\n", + " df = pd.read_csv(csv_file)\n", + " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", + " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", + "\n", + " # Plot the evolution of graph statistics with respect to adj_thresh\n", + " plt.figure(figsize=(14, 10))\n", + "\n", + " # Plot number of edges\n", + " plt.subplot(3, 2, 1)\n", + " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Edges\")\n", + " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot average degree\n", + " plt.subplot(3, 2, 2)\n", + " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Average Degree\")\n", + " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot density\n", + " plt.subplot(3, 2, 3)\n", + " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Density\")\n", + " plt.title(\"Density vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " # Plot number of connected components\n", + " plt.subplot(3, 2, 4)\n", + " plt.plot(\n", + " df[\"adj_thresh\"],\n", + " df[\"number_connected_components\"],\n", + " label=\"Connected Components\",\n", + " color=\"purple\",\n", + " )\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Connected Components\")\n", + " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", + " plt.grid(True)\n", + "\n", + " plt.suptitle(datasets, fontsize=16, y=1.02)\n", + " # Adjust layout and show the plots\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d53f33f", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = [\"wgcna\", \"spearman_correlation\", \"mutual_information\", \"distance_correlation\"]\n", + "\n", + "for metric in metrics:\n", + " # Load the CSV file into a pandas DataFrame\n", + " csv_file = \"./stats/\" + metric + \"/graph_stats.csv\"\n", + " df = pd.read_csv(csv_file)\n", + " # Sort the DataFrame by the 'adj_thresh' column in ascending order\n", + " df = df.sort_values(by=\"adj_thresh\", ascending=True)\n", + "\n", + " # Plot the evolution of graph statistics with respect to adj_thresh\n", + " plt.figure(figsize=(14, 10))\n", + " plt.suptitle(metric, fontsize=18)\n", + "\n", + " # Plot number of edges\n", + " plt.subplot(3, 2, 1)\n", + " plt.plot(df[\"adj_thresh\"], df[\"num_edges\"], label=\"Number of Edges\", color=\"green\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Edges\")\n", + " plt.title(\"Number of Edges vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot average degree\n", + " plt.subplot(3, 2, 2)\n", + " plt.plot(df[\"adj_thresh\"], df[\"avg_degree\"], label=\"Average Degree\", color=\"orange\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Average Degree\")\n", + " plt.title(\"Average Degree vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot density\n", + " plt.subplot(3, 2, 3)\n", + " plt.plot(df[\"adj_thresh\"], df[\"density\"], label=\"Density\", color=\"red\")\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Density\")\n", + " plt.title(\"Density vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Plot number of connected components\n", + " plt.subplot(3, 2, 4)\n", + " plt.plot(\n", + " df[\"adj_thresh\"],\n", + " df[\"number_connected_components\"],\n", + " label=\"Connected Components\",\n", + " color=\"purple\",\n", + " )\n", + " plt.xlabel(\"Adjacency Threshold\")\n", + " plt.ylabel(\"Number of Connected Components\")\n", + " plt.title(\"Connected Components vs. Adjacency Threshold\")\n", + " plt.yscale(\"log\") # Set y-axis to logarithmic scale\n", + " plt.grid(True)\n", + "\n", + " # Adjust layout and show the plots\n", + " plt.tight_layout()\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e6b900", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "proteo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/webapp/README.md b/webapp/README.md index 4ccf4545..b8626dc5 100644 --- a/webapp/README.md +++ b/webapp/README.md @@ -102,6 +102,7 @@ python precompute_stats.py ``` This will: + 1. Download datasets from HuggingFace (`/bgbench`) 2. Compute graph statistics for all 324 combinations: - 3 datasets × 6 ratios × 3 methods × 6 thresholds @@ -109,12 +110,12 @@ This will: **Parameters computed:** -| Parameter | Values | -|-----------|--------| -| Datasets | `motrpac`, `addneuromed`, `parkinsons` | -| Node sample ratios | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 | -| Selection methods | `variance`, `correlation`, `random` | -| Adjacency thresholds | 0.02, 0.1, 0.2, 0.3, 0.4, 0.5 | +| Parameter | Values | +| -------------------- | -------------------------------------- | +| Datasets | `motrpac`, `addneuromed`, `parkinsons` | +| Node sample ratios | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 | +| Selection methods | `variance`, `correlation`, `random` | +| Adjacency thresholds | 0.02, 0.1, 0.2, 0.3, 0.4, 0.5 | **Metrics computed per graph:** @@ -173,12 +174,14 @@ This will: ### How to Update 1. **Replace the JSON files** in `public/data/`: + ```bash cp /path/to/new/results.json public/data/results.json cp /path/to/new/stats.json public/data/stats.json ``` 2. **Rebuild and deploy**: + ```bash make deploy ``` @@ -189,6 +192,7 @@ Results key: `{dataset}|{ratio}|{method}|{threshold}|{model}` Stats key: `{dataset}|{ratio}|{method}|{threshold}` Where: + - `dataset`: `motrpac`, `addneuromed`, or `parkinsons` - `ratio`: node sample ratio (0.5–0.9) - `method`: `variance`, `correlation`, or `random` diff --git a/webapp/precompute_stats.py b/webapp/precompute_stats.py index 346fe1f1..e6961c6f 100644 --- a/webapp/precompute_stats.py +++ b/webapp/precompute_stats.py @@ -56,10 +56,10 @@ def load_raw_data(dataset_name: str) -> tuple[pd.DataFrame, np.ndarray]: """Load raw dataset from HuggingFace (cached).""" if dataset_name in _data_cache: return _data_cache[dataset_name] - + print(f' Loading {dataset_name} from HuggingFace...') config = DATASETS[dataset_name] - + data_file = hf_hub_download( repo_id=HF_REPO_ID, repo_type='dataset', @@ -72,30 +72,30 @@ def load_raw_data(dataset_name: str) -> tuple[pd.DataFrame, np.ndarray]: revision=config['revision'], filename=f'{dataset_name}_targets.parquet', ) - + raw_data = pd.read_parquet(data_file) targets_df = pd.read_parquet(targets_file) - + if 'target' in raw_data.columns: raw_data = raw_data.drop('target', axis=1) - + targets = targets_df['target'].values raw_data, targets = shuffle(raw_data, targets, random_state=42) - + train_val_test_split = config['train_val_test_split'] train_idx = int(len(targets) * train_val_test_split[0]) - + train_data = raw_data.iloc[:train_idx] train_targets = targets[:train_idx] - + imputer = SimpleImputer(strategy='mean') train_data_imputed = imputer.fit_transform(train_data) train_data = pd.DataFrame( train_data_imputed, columns=train_data.columns, index=train_data.index ) - + print(f' {dataset_name}: {train_data.shape[0]} samples, {train_data.shape[1]} features') - + _data_cache[dataset_name] = (train_data, train_targets) return train_data, train_targets @@ -105,7 +105,7 @@ def select_nodes( ) -> np.ndarray: """Select nodes based on feature importance.""" np.random.seed(42) - + if method == 'variance': variances = np.std(data, axis=0) ranked_nodes = np.argsort(variances)[::-1] @@ -119,7 +119,7 @@ def select_nodes( ranked_nodes = np.random.permutation(data.shape[1]) else: raise ValueError(f'Invalid method: {method}') - + return ranked_nodes[:n_selected] @@ -138,38 +138,38 @@ def calculate_adjacency_matrix( def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]: """Compute various graph metrics from adjacency matrix.""" n_nodes = adj_matrix.shape[0] - + adj_no_diag = adj_matrix.copy() np.fill_diagonal(adj_no_diag, 0) - + graph = nx.from_numpy_array(adj_no_diag) - + node_degrees = np.sum(adj_no_diag, axis=1) n_edges = np.sum(adj_no_diag) / 2 max_edges = n_nodes * (n_nodes - 1) / 2 density = n_edges / max_edges if max_edges > 0 else 0 mean_degree = np.mean(node_degrees) if n_nodes > 0 else 0 std_degree = np.std(node_degrees) if n_nodes > 0 else 0 - + n_components = nx.number_connected_components(graph) - + if n_nodes > 0 and n_components > 0: largest_cc = max(nx.connected_components(graph), key=len) largest_cc_ratio = len(largest_cc) / n_nodes * 100 else: largest_cc_ratio = 0 largest_cc = set() - + try: avg_clustering = nx.average_clustering(graph) except Exception: avg_clustering = 0 - + try: if n_components > 0 and len(largest_cc) > 1: subgraph = graph.subgraph(largest_cc) if len(largest_cc) > 100: - sample_nodes = list(largest_cc)[:min(50, len(largest_cc))] + sample_nodes = list(largest_cc)[: min(50, len(largest_cc))] path_lengths = [] for source in sample_nodes[:25]: lengths = nx.single_source_shortest_path_length(subgraph, source) @@ -181,7 +181,7 @@ def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]: avg_path_length = 0 except Exception: avg_path_length = 0 - + return { 'n_nodes': n_nodes, 'n_edges': int(n_edges), @@ -196,14 +196,11 @@ def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]: def get_graph_stats_for_params( - dataset_name: str, - node_sample_ratio: float, - method: str, - adjacency_threshold: float + dataset_name: str, node_sample_ratio: float, method: str, adjacency_threshold: float ) -> dict[str, float]: """Compute graph statistics for given parameters.""" train_data, train_targets = load_raw_data(dataset_name) - + n_training_samples = len(train_targets) if node_sample_ratio >= 1.0: n_nodes = min(train_data.shape[1], 1000) @@ -212,83 +209,83 @@ def get_graph_stats_for_params( if n_nodes > train_data.shape[1]: n_nodes = train_data.shape[1] n_nodes = min(n_nodes, 1000) - + selected_nodes = select_nodes( train_data.values, train_targets, n_selected=n_nodes, method=method ) train_selected = train_data.iloc[:, selected_nodes] - + adj_matrix = calculate_adjacency_matrix(train_selected, adjacency_threshold) metrics = compute_graph_metrics(adj_matrix) metrics['dataset'] = dataset_name - + return metrics def main(): """Precompute all graph statistics and save to file.""" output_file = Path(__file__).parent / 'public' / 'data' / 'stats.json' - - print('='*60) + + print('=' * 60) print('Precomputing Graph Statistics') - print('='*60) - + print('=' * 60) + # Calculate total combinations total = ( - len(DATASETS) * - len(NODE_SAMPLE_RATIOS) * - len(NODE_SELECTION_METHODS) * - len(ADJACENCY_THRESHOLDS) + len(DATASETS) + * len(NODE_SAMPLE_RATIOS) + * len(NODE_SELECTION_METHODS) + * len(ADJACENCY_THRESHOLDS) ) print(f'\nTotal combinations to compute: {total}') print(f' Datasets: {list(DATASETS.keys())}') print(f' Node sample ratios: {NODE_SAMPLE_RATIOS}') print(f' Selection methods: {NODE_SELECTION_METHODS}') print(f' Adjacency thresholds: {ADJACENCY_THRESHOLDS}') - + # Preload all datasets print('\n--- Loading datasets ---') for dataset_name in DATASETS.keys(): load_raw_data(dataset_name) - + # Compute all combinations print('\n--- Computing statistics ---') results = {} start_time = time.time() - - combinations = list(itertools.product( - DATASETS.keys(), - NODE_SAMPLE_RATIOS, - NODE_SELECTION_METHODS, - ADJACENCY_THRESHOLDS - )) - + + combinations = list( + itertools.product( + DATASETS.keys(), NODE_SAMPLE_RATIOS, NODE_SELECTION_METHODS, ADJACENCY_THRESHOLDS + ) + ) + for i, (dataset, ratio, method, threshold) in enumerate(combinations, 1): # Create cache key as string for JSON cache_key = f'{dataset}|{ratio}|{method}|{threshold}' - + if i % 20 == 0 or i == 1: elapsed = time.time() - start_time eta = (elapsed / i) * (total - i) if i > 0 else 0 - print(f' [{i}/{total}] ETA: {eta:.0f}s - {dataset}, p={ratio}, {method}, τ={threshold}') - + print( + f' [{i}/{total}] ETA: {eta:.0f}s - {dataset}, p={ratio}, {method}, τ={threshold}' + ) + try: stats = get_graph_stats_for_params(dataset, ratio, method, threshold) results[cache_key] = stats except Exception as e: print(f' ERROR: {dataset}, p={ratio}, {method}, τ={threshold}: {e}') - + # Save results print(f'\n--- Saving to {output_file} ---') with open(output_file, 'w') as f: json.dump(results, f, indent=2) - + elapsed = time.time() - start_time print(f'\nDone! Computed {len(results)} combinations in {elapsed:.1f}s') print(f'Results saved to: {output_file}') - print('='*60) + print('=' * 60) if __name__ == '__main__': main() -