1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
use FindBin;use lib $FindBin::Bin; |
5 |
|
|
|
6 |
|
|
my $usage = q/Usage: |
7 |
|
|
gffsort [-o <outfile>] [-G] [-T] [-R] <gff_files>... |
8 |
|
|
|
9 |
|
|
Sort the gff\/gtf records found in input <gff_files> by genomic sequence |
10 |
|
|
(1st column) and starting coordinate (4th column) of the main gene\/mRNA |
11 |
|
|
features. Subfeatures (exon,CDS) are kept grouped together with their parents, |
12 |
|
|
the sorting is applied to parent features. |
13 |
|
|
|
14 |
|
|
Options: |
15 |
|
|
-o the sorted output is sent to <outfile>.gff3 instead of stdout |
16 |
|
|
-R sort the reverse strand gff records by the end (maximum) |
17 |
|
|
coordinate of the main features and place them in a separate |
18 |
|
|
output file; this requires the -o option to be present, |
19 |
|
|
the two output files will be called <outfile>.f.gff3 and |
20 |
|
|
<outfile>.r.gff3 (for forward and reverse strands separately) |
21 |
|
|
-T rewrite the 'track' name (2nd GFF column) for each line of |
22 |
|
|
the output with a prefix based on the input filename |
23 |
|
|
-G ignore genomic sequence (1st GFF column) |
24 |
|
|
/; |
25 |
|
|
umask 0002; |
26 |
|
|
getopts('GTRxuo:') || die($usage."\n"); |
27 |
|
|
my $outfile=$Getopt::Std::opt_o; |
28 |
|
|
my $revsort=$Getopt::Std::opt_R; |
29 |
|
|
die ("Error: -R option requires -o \n") if $revsort && !$outfile; |
30 |
|
|
my $no_gsort=$Getopt::Std::opt_G; |
31 |
|
|
my $ftrack=$Getopt::Std::opt_T; |
32 |
|
|
my $fltunpack=$Getopt::Std::opt_u; |
33 |
|
|
my $uflt="$0 -u"; # restore filter (unpack lines) |
34 |
|
|
unless (-x $0) { |
35 |
|
|
die("Error: $0 must have execute permissions!\n"); |
36 |
|
|
} |
37 |
|
|
#---------- unpack filter mode: |
38 |
|
|
if ($fltunpack) { |
39 |
|
|
if ($outfile) { |
40 |
|
|
open(OUTFILE, ">$outfile") || |
41 |
|
|
die ("Error (in unpack filter) creating file $outfile!\n"); |
42 |
|
|
select(OUTFILE); |
43 |
|
|
} |
44 |
|
|
while (<STDIN>) { |
45 |
|
|
tr/\x01/\n/; |
46 |
|
|
print $_; |
47 |
|
|
} |
48 |
|
|
if ($outfile) { |
49 |
|
|
select(STDOUT); |
50 |
|
|
close(OUTFILE); |
51 |
|
|
} |
52 |
|
|
exit; |
53 |
|
|
# --- filter mode ends here -- |
54 |
|
|
} #unpack filter mode |
55 |
|
|
|
56 |
|
|
die($usage."\n") unless @ARGV; |
57 |
|
|
|
58 |
|
|
#--- packing and piping into sort --- |
59 |
|
|
my $gs=''; |
60 |
|
|
$gs='-k1,1' unless $no_gsort; |
61 |
|
|
|
62 |
|
|
if ($revsort) { |
63 |
|
|
$outfile=~s/\.g[tf]f\d?$//i; |
64 |
|
|
my $routfile=$outfile.'.r.gff3'; |
65 |
|
|
$outfile.='.f.gff3'; |
66 |
|
|
my $oe=open(ROUT, "| sort $gs -k5,5nr | $uflt -o $routfile"); |
67 |
|
|
die("Error opening sort pipe!\n") if $? || $oe==0; |
68 |
|
|
} |
69 |
|
|
if ($outfile) { |
70 |
|
|
my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt -o $outfile"); |
71 |
|
|
die("Error opening sort pipe!\n") if $? || $oe==0; |
72 |
|
|
} |
73 |
|
|
else { |
74 |
|
|
my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt"); |
75 |
|
|
} |
76 |
|
|
|
77 |
|
|
my @files=@ARGV; |
78 |
|
|
foreach my $fn (@files) { |
79 |
|
|
die("Error: file $fn not found!\n") unless -f $fn; |
80 |
|
|
} |
81 |
|
|
foreach my $fn (@files) { |
82 |
|
|
next unless -s $fn; |
83 |
|
|
my $trackpre; |
84 |
|
|
if ($ftrack) { |
85 |
|
|
my $p=$fn; |
86 |
|
|
$p=~s/\.\w+$//; #remove extension, if any |
87 |
|
|
($trackpre)=($p=~m/(\w+)$/); #track prefix to use |
88 |
|
|
} |
89 |
|
|
open(GFIN, $fn) || die("Error opening file $fn !\n"); |
90 |
|
|
#------- |
91 |
|
|
my $p_id; #last main feature ID |
92 |
|
|
my $pstrand; # and strand |
93 |
|
|
my @outbuf; |
94 |
|
|
my ($omin, $omax); # |
95 |
|
|
while (<GFIN>) { |
96 |
|
|
chomp; |
97 |
|
|
my ($gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr)=split("\t",$_,9); |
98 |
|
|
next unless $attr && ($strand eq '+' || $strand eq '-' || $strand eq '.'); |
99 |
|
|
my $lcfeat=lc($feat); |
100 |
|
|
my $id; |
101 |
|
|
my $gtflike=1; |
102 |
|
|
if ($attr=~m/\bParent="?([^;"]+)/ && (index($lcfeat,'exon')<0 || $lcfeat eq 'cds')) { |
103 |
|
|
# RNA subfeature in gff3 format (exon or CDS) |
104 |
|
|
$id=$1; |
105 |
|
|
} |
106 |
|
|
elsif ($attr=~m/\bID="?([^;"]+)/) { |
107 |
|
|
# gff3 format, possibly a parent feature |
108 |
|
|
next if $lcfeat eq 'gene' || $lcfeat eq 'locus'; # skip extraneous high-level features in gff3 |
109 |
|
|
$id=$1; |
110 |
|
|
$gtflike=0; |
111 |
|
|
} |
112 |
|
|
elsif ($attr=~m/\btranscript\w*[ =]\"?([^;"]+)/) { |
113 |
|
|
#gtf |
114 |
|
|
$id=$1; |
115 |
|
|
$attr="Parent=$id"; |
116 |
|
|
} |
117 |
|
|
else { |
118 |
|
|
# jigsaw or other gtf-like GFF |
119 |
|
|
($id)=($attr=~m/^\"?([^;" ]+)/); |
120 |
|
|
$attr="Parent=$id"; |
121 |
|
|
if ($feat=~m/exon/i) { |
122 |
|
|
if ($track=~m/jigsaw/) { $feat='CDS' } |
123 |
|
|
else { $feat='exon'} |
124 |
|
|
} |
125 |
|
|
} |
126 |
|
|
next unless $id; # shouldn't happen! |
127 |
|
|
$track=$trackpre if $trackpre; |
128 |
|
|
($start, $end)=($end, $start) if $end<$start; |
129 |
|
|
if ($id ne $p_id) { |
130 |
|
|
#--- ID change |
131 |
|
|
flushgff($pstrand, \@outbuf, $omin, $omax) if $p_id; |
132 |
|
|
$pstrand=$strand; |
133 |
|
|
$omin=$start;$omax=$end; |
134 |
|
|
$p_id=$id; |
135 |
|
|
@outbuf=(); |
136 |
|
|
if ($gtflike) { #this is really true when $id ne $p_id |
137 |
|
|
#build a parent line placeholder |
138 |
|
|
push(@outbuf, |
139 |
|
|
join("\t",$gn, $track, 'mRNA', "\x81","\x82",'.',$strand,'.',"ID=$id;Name=$id")); |
140 |
|
|
} |
141 |
|
|
else { # even if we have a parent line, we want to set the ends |
142 |
|
|
($start,$end)=("\x81","\x82"); |
143 |
|
|
} |
144 |
|
|
} # ID change |
145 |
|
|
else { # another line under the same ID |
146 |
|
|
$omin=$start if $start<$omin; |
147 |
|
|
$omax=$end if $end>$omax; |
148 |
|
|
} |
149 |
|
|
push(@outbuf, |
150 |
|
|
join("\t",$gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr)); |
151 |
|
|
} # while reading lines |
152 |
|
|
flushgff($pstrand, \@outbuf, $omin, $omax); |
153 |
|
|
#------- |
154 |
|
|
close(GFIN); |
155 |
|
|
} # for each input gff/gtf file |
156 |
|
|
|
157 |
|
|
|
158 |
|
|
|
159 |
|
|
##=============== SUBROUTINES ============ |
160 |
|
|
sub flushgff { |
161 |
|
|
my ($s, $ar, $min, $max)=@_; |
162 |
|
|
return unless @$ar>0; |
163 |
|
|
substr($$ar[0], index($$ar[0],"\x81"), 1, $min); |
164 |
|
|
substr($$ar[0], index($$ar[0],"\x82"), 1, $max); |
165 |
|
|
# same as: |
166 |
|
|
# $$ar[0]=~s/\x81/$min/; |
167 |
|
|
# $$ar[0]=~s/\x82/$max/; |
168 |
|
|
if ($revsort) { |
169 |
|
|
if ($s ne '-') { |
170 |
|
|
print FOUT join("\x01",@$ar)."\n"; |
171 |
|
|
} |
172 |
|
|
if ($s ne '+') { |
173 |
|
|
print ROUT join("\x01",@$ar)."\n"; |
174 |
|
|
} |
175 |
|
|
} |
176 |
|
|
else { |
177 |
|
|
print FOUT join("\x01",@$ar)."\n"; |
178 |
|
|
} |
179 |
|
|
@$ar=(); |
180 |
|
|
} |