From 37186391209e6d67005dfff88219b8bebfcefab7 Mon Sep 17 00:00:00 2001 From: Dan-RAI <20709058+Dan-RAI@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:03:41 -0400 Subject: [PATCH 1/3] Fixes to .vcf import (multi-allelic case) --- python/PascalX/refpanel.py | 150 ++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/python/PascalX/refpanel.py b/python/PascalX/refpanel.py index 11ea506..aeff176 100755 --- a/python/PascalX/refpanel.py +++ b/python/PascalX/refpanel.py @@ -230,83 +230,93 @@ def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None): # Main data import loop for line in f: - - # Data line - data = line.split("\t") - - # Get GT pos - tmp = data[8].split(":") - GT = -1 - for j in range(0,len(tmp)): - if tmp[j] == 'GT': - GT = j - break - - # Checks - if (GT == -1) or (data[2][:2] != 'rs') or (data[6] != 'PASS' and qualityT is not None and (int(data[5]) < qualityT)): - continue - - # Read genotype - genotypes = data[9:] - - # Infer alternate alleles (pos 0: ref allele) - alleles = [data[3]] - alleles.extend(data[4].split(",")) - - if SNPonly and (len(data[3]) > 1): - continue - - counter = np.zeros(len(alleles),dtype='int') - - # Only read samples in sampleMap - genomap = {} - for j in range(0,len(sampleKeys)): - - geno = genotypes[sampleKeys[j]].split(":")[GT] - - # Ignore half-calls - if geno[0] != "." and geno[2] != ".": - counter[int(geno[0])] += 1 - counter[int(geno[2])] += 1 - - genomap[sampleKeys[j]] = geno + try: + # Data line + data = line.split("\t") + + # Get GT pos + tmp = data[8].split(":") + GT = -1 + for j in range(0,len(tmp)): + if tmp[j] == 'GT': + GT = j + break + + # Checks + if (GT == -1) or (data[2][:2] != 'rs') or (data[6] != 'PASS' and qualityT is not None and (int(data[5]) < qualityT)): + continue + + # Infer alternate alleles (pos 0: ref allele) + alleles = [data[3]] + alleles.extend(data[4].split(",")) - # Reference allele - refp = 0 - - SC = np.argsort(counter) # Sort alleles count - for p in SC: + if SNPonly and (len(data[3]) > 1): + continue + + counter = np.zeros(len(alleles),dtype='int') - if p != refp: - if SNPonly and len(alleles[p]) > 1: - continue - - minp = str(p) - - gd = np.zeros(len(sampleKeys),dtype='B') + # Read genotype + genotypes = data[9:] - for j in range(0,len(sampleKeys)): - #geno = genotypes[sampleKeys[j]].split(":")[GT] - geno = genomap[sampleKeys[j]] - - # Ignore half-calls - if geno[0] != '.' and geno[2] != '.': - - if geno[0] == minp: - gd[j] += 1 - - if geno[2] == minp: - gd[j] += 1 + # Multi-allelic flag + mallelic = False + + # Only read samples in sampleMap + genomap = {} + for j in range(0,len(sampleKeys)): + + geno = genotypes[sampleKeys[j]].split(":")[GT].split("|") - # Compute MAF - MAF = np.mean(gd)/2. - if (MAF > 0.5): - MAF = 1.0 - MAF; + if len(geno) > 2: + mallelic = True + break + + # Ignore half-calls + if geno[0] != "." and geno[1] != ".": + counter[int(geno[0])] += 1 + counter[int(geno[1])] += 1 - T = [data[2],MAF,gd,alleles[p],alleles[refp]] # Stores alt and ref allele + genomap[sampleKeys[j]] = geno - db.insert({int(data[1]):T}) + if not mallelic: # Skip if more than two alleles + # Reference allele + refp = 0 + + SC = np.argsort(counter) # Sort alleles count + for p in SC: + + if p != refp: + if SNPonly and len(alleles[p]) > 1: + continue + + minp = str(p) + + gd = np.zeros(len(sampleKeys),dtype='B') + + for j in range(0,len(sampleKeys)): + geno = genomap[sampleKeys[j]].split("|") + + # Ignore half-calls + if geno[0] != '.' and geno[1] != '.': + + if geno[0] == minp: + gd[j] += 1 + + if geno[1] == minp: + gd[j] += 1 + + # Compute MAF + MAF = np.mean(gd)/2. + if (MAF > 0.5): + MAF = 1.0 - MAF; + + T = [data[2],MAF,gd,alleles[p],alleles[refp]] # Stores alt and ref allele + + db.insert({int(data[1]):T}) + except Exception as e: + print("[WARNING] Data line broken:",line) + f.close() db.close() From 6eebde1e3260ecc25ec992f4031eb8e3e92833cb Mon Sep 17 00:00:00 2001 From: Dan-RAI <20709058+Dan-RAI@users.noreply.github.com> Date: Tue, 17 Oct 2023 17:08:03 -0400 Subject: [PATCH 2/3] fix --- python/PascalX/refpanel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/PascalX/refpanel.py b/python/PascalX/refpanel.py index aeff176..a6bd956 100755 --- a/python/PascalX/refpanel.py +++ b/python/PascalX/refpanel.py @@ -294,7 +294,7 @@ def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None): gd = np.zeros(len(sampleKeys),dtype='B') for j in range(0,len(sampleKeys)): - geno = genomap[sampleKeys[j]].split("|") + geno = genomap[sampleKeys[j]] # Ignore half-calls if geno[0] != '.' and geno[1] != '.': From 4c0c21f00202ec9a0f52f6ce09f8e6e523e446ce Mon Sep 17 00:00:00 2001 From: Dan-RAI <20709058+Dan-RAI@users.noreply.github.com> Date: Wed, 18 Oct 2023 01:05:02 -0400 Subject: [PATCH 3/3] Changed splitter to pre-compiled re --- python/PascalX/refpanel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/PascalX/refpanel.py b/python/PascalX/refpanel.py index a6bd956..0912b2d 100755 --- a/python/PascalX/refpanel.py +++ b/python/PascalX/refpanel.py @@ -166,6 +166,9 @@ def _import_reference_thread_tped(self,i): return True def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None): + # Pre-compile genotype splitter + Gsplitter = re.compile('/|\|') + # Load filter info keep = set([]) if keepfile is not None: @@ -265,7 +268,7 @@ def _import_reference_thread_vcf(self,i,keepfile,qualityT,SNPonly,regEx=None): genomap = {} for j in range(0,len(sampleKeys)): - geno = genotypes[sampleKeys[j]].split(":")[GT].split("|") + geno = Gsplitter.split( genotypes[sampleKeys[j]].split(":")[GT] ) if len(geno) > 2: mallelic = True