From 9e389468b2bfb4be33a979c8820d98f34c5125b9 Mon Sep 17 00:00:00 2001 From: TheVidz Date: Tue, 31 Mar 2026 01:31:17 +0530 Subject: [PATCH 1/7] fix naming issue deep_ancestry, redirecting to flan --- deep_ancestry/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 deep_ancestry/__init__.py diff --git a/deep_ancestry/__init__.py b/deep_ancestry/__init__.py new file mode 100644 index 0000000..850fd39 --- /dev/null +++ b/deep_ancestry/__init__.py @@ -0,0 +1 @@ +from flan import * \ No newline at end of file From 242990bb05631b613ccc62b4c3c128a54e01888a Mon Sep 17 00:00:00 2001 From: TheVidz Date: Tue, 31 Mar 2026 01:58:36 +0530 Subject: [PATCH 2/7] error in qc.py file causing issue on "prepare" --- flan/preprocess/qc.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/flan/preprocess/qc.py b/flan/preprocess/qc.py index 28b2b6e..db3a252 100644 --- a/flan/preprocess/qc.py +++ b/flan/preprocess/qc.py @@ -16,11 +16,23 @@ def __init__(self, qc_config: Dict) -> None: self.qc_config = qc_config def fit_transform(self, cache: FileCache) -> None: - run_plink(args_list=['--pfile', str(cache.pfile_path()), 'vzs', '--make-pgen'], - args_dict={**{'--out': str(cache.pfile_path()), # Merging dicts here - '--set-missing-var-ids': '@:#'}, - **self.qc_config}) - + # Create a new output path for QC-processed data + qc_path = str(cache.pfile_path()) + "_qc" + + run_plink( + args_list=[ + '--pfile', str(cache.pfile_path()), + '--make-pgen' + ], + args_dict={ + '--out': qc_path, + '--set-missing-var-ids': '@:#', + **self.qc_config + } + ) + + # ✅ VERY IMPORTANT: update cache to point to QC output + cache._pfile_path = qc_path def transform(self, source_path: str, dest_path: str) -> None: run_plink(args_list=['--make-pgen', '--pfile', str(source_path)], From e9e9d8a428932816642fac4476479b2175ec1582 Mon Sep 17 00:00:00 2001 From: TheVidz Date: Tue, 31 Mar 2026 02:16:04 +0530 Subject: [PATCH 3/7] fix num folds error in sample_splitter 
--- flan/preprocess/sample_splitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flan/preprocess/sample_splitter.py b/flan/preprocess/sample_splitter.py index 5a38c34..82737ab 100644 --- a/flan/preprocess/sample_splitter.py +++ b/flan/preprocess/sample_splitter.py @@ -29,6 +29,10 @@ def _split_ids(self, y: y can be passed to trigger StratifiedKFold instead of KFold random_state (int): Fixed random_state for train_test_split sklearn function """ + # adding min 5 folds + num_folds = getattr(self.args, "num_folds", 5) + self.args.num_folds = num_folds + ids = pandas.read_table(cache.ids_path()).rename(columns={'#IID': 'IID'}).filter(['FID', 'IID']) indices = numpy.arange(ids.shape[0]) if self.args.num_folds == 1: From a339c66ba70f375bd02c24903cef19bbe377b9e6 Mon Sep 17 00:00:00 2001 From: TheVidz Date: Tue, 31 Mar 2026 13:15:25 +0530 Subject: [PATCH 4/7] Fixing Error: --read-freq variant ID '.' appears multiple times --- flan/preprocess/qc.py | 2 +- flan/preprocess/sample_splitter.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/flan/preprocess/qc.py b/flan/preprocess/qc.py index db3a252..fa37fa5 100644 --- a/flan/preprocess/qc.py +++ b/flan/preprocess/qc.py @@ -26,7 +26,7 @@ def fit_transform(self, cache: FileCache) -> None: ], args_dict={ '--out': qc_path, - '--set-missing-var-ids': '@:#', + '--set-missing-var-ids': '@:#:$r:$a', **self.qc_config } ) diff --git a/flan/preprocess/sample_splitter.py b/flan/preprocess/sample_splitter.py index 82737ab..c4ffe1b 100644 --- a/flan/preprocess/sample_splitter.py +++ b/flan/preprocess/sample_splitter.py @@ -71,12 +71,17 @@ def _split_ids(self, ids.iloc[indices, :].to_csv(out_path, sep='\t', index=False) def _split_genotypes(self, cache: FileCache) -> None: + # 🔥 Force use of QC-processed genotype + base_path = str(cache.pfile_path()) + if not base_path.endswith("_qc"): + base_path = base_path + "_qc" + for fold_index, part in product(range(cache.num_folds), ['train', 'val', 
'test']): run_plink( args_dict={ - '--pfile': str(cache.pfile_path()), + '--pfile': base_path, # ✅ FIXED: use QC data '--keep': str(cache.ids_path(fold_index, part)), - '--out': str(cache.pfile_path(fold_index, part)) + '--out': str(cache.pfile_path(fold_index, part)) }, args_list=['--make-pgen'] ) @@ -93,7 +98,9 @@ def _split_phenotypes(self, cache: FileCache) -> None: ) def fit_transform(self, cache: FileCache) -> None: - + # Force splitter to use QC output + if not str(cache.pfile_path()).endswith("_qc"): + cache._pfile_path = str(cache.pfile_path()) + "_qc" self._split_ids(cache) self._split_genotypes(cache) self._split_phenotypes(cache) From e0d0bd1def5ae52cd566cc853b222cd918761ecd Mon Sep 17 00:00:00 2001 From: TheVidz Date: Sun, 24 May 2026 20:36:31 +0530 Subject: [PATCH 5/7] revert commit a339c66 --- flan/preprocess/qc.py | 2 +- flan/preprocess/sample_splitter.py | 15 ++++----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/flan/preprocess/qc.py b/flan/preprocess/qc.py index fa37fa5..db3a252 100644 --- a/flan/preprocess/qc.py +++ b/flan/preprocess/qc.py @@ -26,7 +26,7 @@ def fit_transform(self, cache: FileCache) -> None: ], args_dict={ '--out': qc_path, - '--set-missing-var-ids': '@:#:$r:$a', + '--set-missing-var-ids': '@:#', **self.qc_config } ) diff --git a/flan/preprocess/sample_splitter.py b/flan/preprocess/sample_splitter.py index c4ffe1b..da7fa25 100644 --- a/flan/preprocess/sample_splitter.py +++ b/flan/preprocess/sample_splitter.py @@ -71,20 +71,15 @@ def _split_ids(self, ids.iloc[indices, :].to_csv(out_path, sep='\t', index=False) def _split_genotypes(self, cache: FileCache) -> None: - # 🔥 Force use of QC-processed genotype - base_path = str(cache.pfile_path()) - if not base_path.endswith("_qc"): - base_path = base_path + "_qc" - for fold_index, part in product(range(cache.num_folds), ['train', 'val', 'test']): run_plink( args_dict={ - '--pfile': base_path, # ✅ FIXED: use QC data + '--pfile': str(cache.pfile_path()), '--keep': 
str(cache.ids_path(fold_index, part)), - '--out': str(cache.pfile_path(fold_index, part)) + '--out': str(cache.pfile_path(fold_index, part)) }, args_list=['--make-pgen'] - ) + ) def _split_phenotypes(self, cache: FileCache) -> None: phenotype = pandas.read_table(cache.phenotype_path(), names=['IID', 'ancestry', 'in_phase3']) @@ -98,9 +93,7 @@ def _split_phenotypes(self, cache: FileCache) -> None: ) def fit_transform(self, cache: FileCache) -> None: - # Force splitter to use QC output - if not str(cache.pfile_path()).endswith("_qc"): - cache._pfile_path = str(cache.pfile_path()) + "_qc" + self._split_ids(cache) self._split_genotypes(cache) self._split_phenotypes(cache) From ffa8170bcc47d64cc5ec75a4241cf255dff6876b Mon Sep 17 00:00:00 2001 From: TheVidz Date: Sun, 24 May 2026 20:37:35 +0530 Subject: [PATCH 6/7] comment out commit e9e9d8a --- flan/preprocess/sample_splitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flan/preprocess/sample_splitter.py b/flan/preprocess/sample_splitter.py index da7fa25..8644174 100644 --- a/flan/preprocess/sample_splitter.py +++ b/flan/preprocess/sample_splitter.py @@ -30,8 +30,8 @@ def _split_ids(self, random_state (int): Fixed random_state for train_test_split sklearn function """ # adding min 5 folds - num_folds = getattr(self.args, "num_folds", 5) - self.args.num_folds = num_folds + # num_folds = getattr(self.args, "num_folds", 5) + # self.args.num_folds = num_folds ids = pandas.read_table(cache.ids_path()).rename(columns={'#IID': 'IID'}).filter(['FID', 'IID']) indices = numpy.arange(ids.shape[0]) From 2de0dc611cc50ae21b82bfe268eeb85ca2ee22a9 Mon Sep 17 00:00:00 2001 From: TheVidz Date: Sun, 24 May 2026 20:38:52 +0530 Subject: [PATCH 7/7] revert commit 242990b --- flan/preprocess/qc.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/flan/preprocess/qc.py b/flan/preprocess/qc.py index db3a252..2616fb9 100644 --- a/flan/preprocess/qc.py +++ b/flan/preprocess/qc.py 
@@ -14,25 +14,11 @@ class QCArgs: class QC: def __init__(self, qc_config: Dict) -> None: self.qc_config = qc_config - def fit_transform(self, cache: FileCache) -> None: - # Create a new output path for QC-processed data - qc_path = str(cache.pfile_path()) + "_qc" - - run_plink( - args_list=[ - '--pfile', str(cache.pfile_path()), - '--make-pgen' - ], - args_dict={ - '--out': qc_path, - '--set-missing-var-ids': '@:#', - **self.qc_config - } - ) - - # ✅ VERY IMPORTANT: update cache to point to QC output - cache._pfile_path = qc_path + run_plink(args_list=['--pfile', str(cache.pfile_path()), 'vzs', '--make-pgen'], + args_dict={**{'--out': str(cache.pfile_path()), # Merging dicts here + '--set-missing-var-ids': '@:#'}, + **self.qc_config}) def transform(self, source_path: str, dest_path: str) -> None: run_plink(args_list=['--make-pgen', '--pfile', str(source_path)],