diff --git a/.gitignore b/.gitignore
index d0a90784..5ba05b56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -192,3 +192,9 @@ logs/
 *.parquet
 datasets/
 run_data/
+
+# Anonymization: exclude files with identifying info for ICML submission
+search_results/
+outputs/
+wandb_analysis_report.md
+.cursor/
diff --git a/LICENSE b/LICENSE
index 86852bbf..c312126d 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2025 geometric-intelligence
+Copyright (c) 2025 Anonymous
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 36181af6..445382f9 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,3 @@
-[![Code Quality](https://github.com/geometric-intelligence/bgbench/actions/workflows/code-quality-main.yaml/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/code-quality-main.yaml)
-[![Dependencies](https://github.com/geometric-intelligence/bgbench/actions/workflows/dependabot/dependabot-updates/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/dependabot/dependabot-updates)
-[![Tests](https://github.com/geometric-intelligence/bgbench/actions/workflows/test.yml/badge.svg)](https://github.com/geometric-intelligence/bgbench/actions/workflows/test.yml)
-
 # Big Graph Bench (BGBench)
 
 A comprehensive benchmarking framework for Graph Neural Networks (GNNs) on omics datasets for classification tasks. This repository provides standardized datasets, preprocessing pipelines, and evaluation metrics to enable fair comparison of different GNN architectures on biological data.
@@ -24,7 +20,7 @@ The easiest way to set up BGBench is using the provided environment setup script
 
 ```bash
 # Clone the repository
-git clone git@github.com:geometric-intelligence/bgbench.git
+git clone
 cd bgbench
 
 # Run the automated setup script
@@ -44,7 +40,7 @@ If you prefer manual setup:
 
 ```bash
 # Clone the repository
-git clone git@github.com:geometric-intelligence/bgbench.git
+git clone
 cd bgbench
 
 # Create conda environment
@@ -91,7 +87,7 @@ BGBench includes three curated omics datasets for graph-based classification:
 
 ### Dataset Storage and Access
 
-All datasets are stored on Hugging Face Hub at `geometric-intelligence/bgbench` and automatically downloaded when needed. The datasets are preprocessed and stored in Parquet format for efficient loading.
+All datasets are stored on Hugging Face Hub at `/bgbench` and automatically downloaded when needed. The datasets are preprocessed and stored in Parquet format for efficient loading.
 
 ## Dataset Preprocessing
@@ -392,9 +388,9 @@ If you use BGBench in your research, please cite:
 
 ```bibtex
 @software{bgbench2024,
   title={Big Graph Bench: A Benchmarking Framework for Graph Neural Networks on Omics Data},
-  author={Geometric Intelligence Team},
+  author={Anonymous Authors},
   year={2024},
-  url={https://github.com/geometric-intelligence/bgbench}
+  url={}
 }
 ```
diff --git a/notebooks/datasets.ipynb b/notebooks/datasets.ipynb
index a40af1c5..1aba645f 100644
--- a/notebooks/datasets.ipynb
+++ b/notebooks/datasets.ipynb
@@ -17,7 +17,7 @@
     "import pandas as pd\n",
     "import seaborn as sns\n",
     "\n",
-    "sys.path.append(\"/home/johmathe/bgbench\")\n",
+    "sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(\".\"))))\n",
     "os.environ[\"PYTHONPATH\"] = os.pathsep.join(sys.path)\n",
     "from src.data import hf_datamodule\n"
    ]
diff --git a/ogbench/baseline.py b/ogbench/baseline.py
index 58370398..c3d246a5 100644
--- a/ogbench/baseline.py
+++ b/ogbench/baseline.py
@@ -14,7 +14,6 @@
 import pandas as pd
 import rootutils
 import seaborn as sns
-import wandb
 from huggingface_hub import hf_hub_download
 from omegaconf import DictConfig, OmegaConf
 from sklearn.decomposition import PCA
@@ -35,6 +34,8 @@
 from sklearn.pipeline import Pipeline
 from sklearn.utils import shuffle
 
+import wandb
+
 rootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
 
 # Set matplotlib style
@@ -171,7 +172,7 @@ def load_metadata(data_name: str, cfg: DictConfig) -> dict[str, Any] | None:
     # Download from HuggingFace
     try:
         logger.info('Downloading metadata from HuggingFace...')
-        hf_repo_id = 'geometric-intelligence/bgbench'
+        hf_repo_id = '/bgbench'
         revision = cfg.dataset.loader.parameters.get('revision', 'e1631e8')
 
         metadata_file = hf_hub_download(  # nosec
@@ -248,7 +249,7 @@ def load_and_prepare_data(cfg: DictConfig) -> DatasetContainer:
 
     # Download from HuggingFace
     logger.info('Downloading from HuggingFace...')
-    hf_repo_id = 'geometric-intelligence/bgbench'
+    hf_repo_id = '/bgbench'
     revision = cfg.dataset.loader.parameters.get('revision', 'e1631e8')
 
     data_file = hf_hub_download(  # nosec
diff --git a/pyproject.toml b/pyproject.toml
index 251444e9..681631e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,9 +94,8 @@ all = ["ogbench[dev, doc]"]
 [project.scripts]
 ogbench-train = "ogbench.run:main"
 
-[project.urls]
-homepage="https://geometric-intelligence.github.io/bgbench/index.html"
-repository="https://github.com/geometric-intelligence/bgbench"
+# [project.urls]
+# homepage and repository URLs removed for anonymous review
 
 [tool.ruff]
 line-length = 99
diff --git a/scripts/export_wandb.py b/scripts/export_wandb.py
index f2588b76..59aff415 100644
--- a/scripts/export_wandb.py
+++ b/scripts/export_wandb.py
@@ -2,6 +2,7 @@
 from typing import Any
 
 import pandas as pd
+
 import wandb
 
 
@@ -18,7 +19,7 @@ def flatten_dict(d: dict[str, Any], parent_key: str = '', sep: str = '_') -> dic
 
 def main() -> None:
     api = wandb.Api()
-    runs = api.runs('johmathe/biggraphbench')
+    runs = api.runs('/biggraphbench')
 
     summary_list: list[dict[str, Any]] = []
     config_list: list[dict[str, Any]] = []
diff --git a/scripts/plot_adjacency_threshold_analysis.py b/scripts/plot_adjacency_threshold_analysis.py
index f54e3811..6c424ace 100755
--- a/scripts/plot_adjacency_threshold_analysis.py
+++ b/scripts/plot_adjacency_threshold_analysis.py
@@ -49,7 +49,7 @@
     },
 }
 
-HF_REPO_ID = 'geometric-intelligence/bgbench'
+HF_REPO_ID = '/bgbench'
 
 
 def load_and_preprocess_dataset(
diff --git a/scripts/utils.py b/scripts/utils.py
index 7b926cf3..5d9edb3d 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -79,7 +79,7 @@ def upload_to_huggingface(
     """
     try:
         api = huggingface_hub.HfApi()
-        repo_id = 'geometric-intelligence/bgbench'
+        repo_id = '/bgbench'
 
         # Create repository if it doesn't exist
         try:
diff --git a/tutorials/dataset_stats.ipynb b/tutorials/dataset_stats.ipynb
index 8224f7c0..91753512 100644
--- a/tutorials/dataset_stats.ipynb
+++ b/tutorials/dataset_stats.ipynb
@@ -83,7 +83,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "root = \"/home/lcornelis/code/bgbench/run_data/omics/\"\n",
+    "root = \"./run_data/omics/\"\n",
     "name = osp.join(\n",
     "    root,\n",
     "    f\"{dataset.data_name}\",\n",
@@ -119,7 +119,7 @@
     "    Get statistics of the graph.\n",
     "    \"\"\"\n",
     "    # Load the adjacency matrix\n",
-    "    root = \"/home/lcornelis/code/bgbench/run_data/omics/\"\n",
+    "    root = \"./run_data/omics/\"\n",
     "    name = osp.join(\n",
     "        root,\n",
     "        f\"{dataset.data_name}\",\n",
diff --git a/tutorials/dataset_stats_analysis.py b/tutorials/dataset_stats_analysis.py
index bc8243a1..5ecdf2b9 100644
--- a/tutorials/dataset_stats_analysis.py
+++ b/tutorials/dataset_stats_analysis.py
@@ -39,7 +39,7 @@ def load_dataset(
 
     if node_sample_ratio == 'full':
         dataset = HFOmicsDataset(
-            root='/home/johmathe/bgbench/run_data/omics',
+            root='./run_data/omics',
             data_name=dataset_name,
             method=method,
             adjacency_threshold=adj_thresh,
@@ -49,7 +49,7 @@ def load_dataset(
         )
     else:
         dataset = HFOmicsDataset(
-            root='/home/johmathe/bgbench/run_data/omics',
+            root='./run_data/omics',
             data_name=dataset_name,
             method=method,
             adjacency_threshold=adj_thresh,
@@ -85,7 +85,7 @@ def get_graph_stats(dataset: Any) -> dict[str, float]:
         graph.add_nodes_from(range(num_nodes))
         graph.add_edges_from(edge_list)
     else:
-        root = '/home/johmathe/bgbench/run_data/omics/'
+        root = './run_data/omics/'
         name = osp.join(
             root,
             f'{dataset.data_name}',
diff --git a/webapp/README.md b/webapp/README.md
index 9859ead4..b8626dc5 100644
--- a/webapp/README.md
+++ b/webapp/README.md
@@ -102,19 +102,20 @@ python precompute_stats.py
 ```
 
 This will:
-1. Download datasets from HuggingFace (`geometric-intelligence/bgbench`)
+
+1. Download datasets from HuggingFace (`/bgbench`)
 2. Compute graph statistics for all 324 combinations:
    - 3 datasets × 6 ratios × 3 methods × 6 thresholds
 3. Save results to `public/data/stats.json`
 
 **Parameters computed:**
 
-| Parameter | Values |
-|-----------|--------|
-| Datasets | `motrpac`, `addneuromed`, `parkinsons` |
-| Node sample ratios | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 |
-| Selection methods | `variance`, `correlation`, `random` |
-| Adjacency thresholds | 0.02, 0.1, 0.2, 0.3, 0.4, 0.5 |
+| Parameter            | Values                                 |
+| -------------------- | -------------------------------------- |
+| Datasets             | `motrpac`, `addneuromed`, `parkinsons` |
+| Node sample ratios   | 0.5, 0.6, 0.7, 0.8, 0.9, 1.0           |
+| Selection methods    | `variance`, `correlation`, `random`    |
+| Adjacency thresholds | 0.02, 0.1, 0.2, 0.3, 0.4, 0.5          |
 
 **Metrics computed per graph:**
@@ -173,12 +174,14 @@
 ### How to Update
 
 1. **Replace the JSON files** in `public/data/`:
+
    ```bash
    cp /path/to/new/results.json public/data/results.json
    cp /path/to/new/stats.json public/data/stats.json
    ```
 
 2. **Rebuild and deploy**:
+
    ```bash
    make deploy
    ```
@@ -189,6 +192,7 @@ Results key: `{dataset}|{ratio}|{method}|{threshold}|{model}`
 Stats key: `{dataset}|{ratio}|{method}|{threshold}`
 
 Where:
+
 - `dataset`: `motrpac`, `addneuromed`, or `parkinsons`
 - `ratio`: node sample ratio (0.5–0.9)
 - `method`: `variance`, `correlation`, or `random`
diff --git a/webapp/precompute_stats.py b/webapp/precompute_stats.py
index 7fc01caa..e6961c6f 100644
--- a/webapp/precompute_stats.py
+++ b/webapp/precompute_stats.py
@@ -41,7 +41,7 @@
     },
 }
 
-HF_REPO_ID = 'geometric-intelligence/bgbench'
+HF_REPO_ID = '/bgbench'
 
 # Parameter grids
 NODE_SAMPLE_RATIOS = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -56,10 +56,10 @@ def load_raw_data(dataset_name: str) -> tuple[pd.DataFrame, np.ndarray]:
     """Load raw dataset from HuggingFace (cached)."""
     if dataset_name in _data_cache:
         return _data_cache[dataset_name]
-    
+
     print(f'  Loading {dataset_name} from HuggingFace...')
     config = DATASETS[dataset_name]
-    
+
     data_file = hf_hub_download(
         repo_id=HF_REPO_ID,
         repo_type='dataset',
@@ -72,30 +72,30 @@
         revision=config['revision'],
         filename=f'{dataset_name}_targets.parquet',
     )
-    
+
     raw_data = pd.read_parquet(data_file)
     targets_df = pd.read_parquet(targets_file)
-    
+
     if 'target' in raw_data.columns:
         raw_data = raw_data.drop('target', axis=1)
-    
+
     targets = targets_df['target'].values
     raw_data, targets = shuffle(raw_data, targets, random_state=42)
-    
+
     train_val_test_split = config['train_val_test_split']
     train_idx = int(len(targets) * train_val_test_split[0])
-    
+
     train_data = raw_data.iloc[:train_idx]
     train_targets = targets[:train_idx]
-    
+
     imputer = SimpleImputer(strategy='mean')
     train_data_imputed = imputer.fit_transform(train_data)
     train_data = pd.DataFrame(
         train_data_imputed, columns=train_data.columns, index=train_data.index
     )
-    
+
     print(f'  {dataset_name}: {train_data.shape[0]} samples, {train_data.shape[1]} features')
-    
+
     _data_cache[dataset_name] = (train_data, train_targets)
     return train_data, train_targets
@@ -105,7 +105,7 @@
 ) -> np.ndarray:
     """Select nodes based on feature importance."""
     np.random.seed(42)
-    
+
     if method == 'variance':
         variances = np.std(data, axis=0)
         ranked_nodes = np.argsort(variances)[::-1]
@@ -119,7 +119,7 @@
         ranked_nodes = np.random.permutation(data.shape[1])
     else:
         raise ValueError(f'Invalid method: {method}')
-    
+
     return ranked_nodes[:n_selected]
 
 
@@ -138,38 +138,38 @@ def calculate_adjacency_matrix(
 def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]:
     """Compute various graph metrics from adjacency matrix."""
     n_nodes = adj_matrix.shape[0]
-    
+
     adj_no_diag = adj_matrix.copy()
     np.fill_diagonal(adj_no_diag, 0)
-    
+
     graph = nx.from_numpy_array(adj_no_diag)
-    
+
     node_degrees = np.sum(adj_no_diag, axis=1)
     n_edges = np.sum(adj_no_diag) / 2
     max_edges = n_nodes * (n_nodes - 1) / 2
     density = n_edges / max_edges if max_edges > 0 else 0
     mean_degree = np.mean(node_degrees) if n_nodes > 0 else 0
     std_degree = np.std(node_degrees) if n_nodes > 0 else 0
-    
+
     n_components = nx.number_connected_components(graph)
-    
+
     if n_nodes > 0 and n_components > 0:
         largest_cc = max(nx.connected_components(graph), key=len)
         largest_cc_ratio = len(largest_cc) / n_nodes * 100
     else:
         largest_cc_ratio = 0
         largest_cc = set()
-    
+
     try:
         avg_clustering = nx.average_clustering(graph)
     except Exception:
         avg_clustering = 0
-    
+
     try:
         if n_components > 0 and len(largest_cc) > 1:
             subgraph = graph.subgraph(largest_cc)
             if len(largest_cc) > 100:
-                sample_nodes = list(largest_cc)[:min(50, len(largest_cc))]
+                sample_nodes = list(largest_cc)[: min(50, len(largest_cc))]
                 path_lengths = []
                 for source in sample_nodes[:25]:
                     lengths = nx.single_source_shortest_path_length(subgraph, source)
@@ -181,7 +181,7 @@ def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]:
             avg_path_length = 0
     except Exception:
         avg_path_length = 0
-    
+
     return {
         'n_nodes': n_nodes,
         'n_edges': int(n_edges),
@@ -196,14 +196,11 @@ def compute_graph_metrics(adj_matrix: np.ndarray) -> dict[str, float]:
 
 
 def get_graph_stats_for_params(
-    dataset_name: str,
-    node_sample_ratio: float,
-    method: str,
-    adjacency_threshold: float
+    dataset_name: str, node_sample_ratio: float, method: str, adjacency_threshold: float
 ) -> dict[str, float]:
     """Compute graph statistics for given parameters."""
     train_data, train_targets = load_raw_data(dataset_name)
-    
+
     n_training_samples = len(train_targets)
     if node_sample_ratio >= 1.0:
         n_nodes = min(train_data.shape[1], 1000)
@@ -212,83 +209,83 @@
         if n_nodes > train_data.shape[1]:
             n_nodes = train_data.shape[1]
         n_nodes = min(n_nodes, 1000)
-    
+
     selected_nodes = select_nodes(
         train_data.values, train_targets, n_selected=n_nodes, method=method
     )
     train_selected = train_data.iloc[:, selected_nodes]
-    
+
     adj_matrix = calculate_adjacency_matrix(train_selected, adjacency_threshold)
     metrics = compute_graph_metrics(adj_matrix)
     metrics['dataset'] = dataset_name
-    
+
     return metrics
 
 
 def main():
     """Precompute all graph statistics and save to file."""
     output_file = Path(__file__).parent / 'public' / 'data' / 'stats.json'
-    
-    print('='*60)
+
+    print('=' * 60)
     print('Precomputing Graph Statistics')
-    print('='*60)
-    
+    print('=' * 60)
+
     # Calculate total combinations
     total = (
-        len(DATASETS) *
-        len(NODE_SAMPLE_RATIOS) *
-        len(NODE_SELECTION_METHODS) *
-        len(ADJACENCY_THRESHOLDS)
+        len(DATASETS)
+        * len(NODE_SAMPLE_RATIOS)
+        * len(NODE_SELECTION_METHODS)
+        * len(ADJACENCY_THRESHOLDS)
    )
     print(f'\nTotal combinations to compute: {total}')
     print(f'  Datasets: {list(DATASETS.keys())}')
     print(f'  Node sample ratios: {NODE_SAMPLE_RATIOS}')
     print(f'  Selection methods: {NODE_SELECTION_METHODS}')
     print(f'  Adjacency thresholds: {ADJACENCY_THRESHOLDS}')
-    
+
     # Preload all datasets
     print('\n--- Loading datasets ---')
     for dataset_name in DATASETS.keys():
         load_raw_data(dataset_name)
-    
+
     # Compute all combinations
     print('\n--- Computing statistics ---')
     results = {}
     start_time = time.time()
-    
-    combinations = list(itertools.product(
-        DATASETS.keys(),
-        NODE_SAMPLE_RATIOS,
-        NODE_SELECTION_METHODS,
-        ADJACENCY_THRESHOLDS
-    ))
-    
+
+    combinations = list(
+        itertools.product(
+            DATASETS.keys(), NODE_SAMPLE_RATIOS, NODE_SELECTION_METHODS, ADJACENCY_THRESHOLDS
+        )
+    )
+
     for i, (dataset, ratio, method, threshold) in enumerate(combinations, 1):
         # Create cache key as string for JSON
         cache_key = f'{dataset}|{ratio}|{method}|{threshold}'
-        
+
         if i % 20 == 0 or i == 1:
             elapsed = time.time() - start_time
             eta = (elapsed / i) * (total - i) if i > 0 else 0
-            print(f'  [{i}/{total}] ETA: {eta:.0f}s - {dataset}, p={ratio}, {method}, τ={threshold}')
-        
+            print(
+                f'  [{i}/{total}] ETA: {eta:.0f}s - {dataset}, p={ratio}, {method}, τ={threshold}'
+            )
+
         try:
             stats = get_graph_stats_for_params(dataset, ratio, method, threshold)
             results[cache_key] = stats
         except Exception as e:
             print(f'  ERROR: {dataset}, p={ratio}, {method}, τ={threshold}: {e}')
-    
+
     # Save results
     print(f'\n--- Saving to {output_file} ---')
     with open(output_file, 'w') as f:
         json.dump(results, f, indent=2)
-    
+
     elapsed = time.time() - start_time
     print(f'\nDone! Computed {len(results)} combinations in {elapsed:.1f}s')
     print(f'Results saved to: {output_file}')
-    print('='*60)
+    print('=' * 60)
 
 
 if __name__ == '__main__':
     main()
-
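
The hunks above scrub the identifying strings `geometric-intelligence`, `johmathe`, and `lcornelis` by hand, file by file, so a tree-wide scan is a useful final check before submission. The script below is not part of the patch — it is a minimal sketch, assuming it is run from the repository root after the patch is applied; the token list is taken from the removed lines above, and the skip list mirrors the new `.gitignore` entries.

```python
#!/usr/bin/env python3
"""Sanity check: scan the working tree for identifiers removed by this patch.

Minimal sketch, not part of the patch. A clean pass prints nothing; any
output line points at a leftover identifying string.
"""
from pathlib import Path

# Identifiers this patch removes; any remaining occurrence is a leak.
TOKENS = ('geometric-intelligence', 'johmathe', 'lcornelis')

# Directories excluded by the new .gitignore entries, plus VCS internals.
SKIP_DIRS = {'.git', '.cursor', 'search_results', 'outputs', 'run_data', 'datasets'}

for path in Path('.').rglob('*'):
    # Skip directories themselves and anything under an excluded directory.
    if not path.is_file() or SKIP_DIRS.intersection(path.parts):
        continue
    try:
        text = path.read_text(errors='ignore')
    except OSError:
        continue
    for lineno, line in enumerate(text.splitlines(), start=1):
        for token in TOKENS:
            if token in line:
                print(f'{path}:{lineno}: contains {token!r}')
```

Binary artifacts (e.g., the `*.parquet` files already ignored) are read with `errors='ignore'`, so the scan degrades gracefully on non-UTF-8 content instead of crashing.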