From 30aa6de9070841bd693b37eca489c5aaad5a20c0 Mon Sep 17 00:00:00 2001 From: theob0t Date: Thu, 26 Feb 2026 11:19:29 -0500 Subject: [PATCH] Fix prepDE.py3 KeyError for transcripts missing from first sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When StringTie -e is used with piped input (e.g. samtools view -q 255 | stringtie -), transcripts with zero passing-filter reads may be omitted from the output GTF. Since geneIDs is only populated from the first sample (loop 1 breaks after the first successful parse), transcripts that appear in later samples but not the first cause a KeyError at line 279. The current defaultdict(lambda: str) suppresses the crash but silently maps missing transcripts to the str type object, corrupting the gene count matrix. Fix: skip transcripts not present in geneIDs. These are transcripts with zero counts in the first sample — their transcript-level counts are still written correctly to the transcript count matrix from t_dict. Fixes #428, related to #337. --- prepDE.py3 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/prepDE.py3 b/prepDE.py3 index ab5ee66..73127eb 100755 --- a/prepDE.py3 +++ b/prepDE.py3 @@ -275,11 +275,9 @@ for q, s in enumerate(samples): for i,v in t_dict.items(): ## print i,v - try: - geneDict[geneIDs[i]][s[0]]+=v[s[0]] - except KeyError: - print("Error: could not locate transcript %s entry for sample %s" % ( i, s[0] )) - raise + if i not in geneIDs: + continue + geneDict[geneIDs[i]][s[0]]+=v[s[0]] if opts.v: print("..writing %s " % ( opts.t ))