-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgb_gff_filter.pl
More file actions
executable file
·65 lines (61 loc) · 1.87 KB
/
gb_gff_filter.pl
File metadata and controls
executable file
·65 lines (61 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/perl
use strict;
use Getopt::Std;
my $usage = q/gb_gff_filter.pl [options] <input_GFF_stream>
Processes genomic annotation GFF files from NCBI
filtering out anything but the following gene_biotype genes:
lncRNA
protein_coding
/;
getopts('ho:') || die($usage."\n");
my $outfile=$Getopt::Std::opt_o;
if ($outfile) {
open(OUTFILE, '>'.$outfile) || die ("Error creating file $outfile\n");
select(OUTFILE);
}
my @OKbiotypes=('lncRNA', 'protein_coding');
my %biotypes;
@biotypes{@OKbiotypes}= (1) x scalar(@OKbiotypes);
my %OKgenes; # chr|ID for "gene" features with OK biotype
my %OKtranscripts; # chr|ID for children of OK genes
my ($numgenes, $numtranscripts);
while (<>) {
next if m/^\s*#/;
my $line=$_;
chomp;
my ($chr, $track, $f, $fstart, $fend, $fscore,
$strand, $frame, $lnum)=split(/\t/);
my ($ID)=($lnum=~m/\bID=([^;]+)/);
my ($Parent)=($lnum=~m/\bParent=([^;]+)/);
die("Error: no ID or Parent found for:\n$line\n")
unless ($ID || $Parent);
next unless ($ID || $Parent);
my ($g_id, $t_id);
if ($f eq 'gene') {
my ($btype)=($lnum=~m/\bgene_biotype=([^;]+)/);
die("Error: biotype not found for gene line:\n$line\n") unless $btype;
#print STDERR "biotype=<$btype>\n";
next unless ($btype && exists($biotypes{$btype}));
$g_id=$chr.'|'.$ID;
$OKgenes{$g_id}=[] unless exists($OKgenes{$g_id});
print STDOUT $line;
next;
}
if ($f=~m/transcript$/ || $f=~m/RNA$/) {
$t_id=$chr.'|'.$ID;
$g_id=$chr.'|'.$Parent;
next unless (exists($OKgenes{$g_id}));
print STDOUT $line;
$OKtranscripts{$t_id}=[] unless exists($OKtranscripts{$t_id});
next;
}
if ($f=~m/exon$/ || $f=~m/CDS$/) {
$t_id=$chr.'|'.$Parent;
if (exists($OKgenes{$t_id})) {
print STDERR "Gene-transcript exception (exon/CDS parented by gene $t_id)\n";
$OKtranscripts{$t_id}=[];
}
next unless (exists($OKtranscripts{$t_id}));
print STDOUT $line;
}
}