ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gffsort
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5489 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gffsort [-o <outfile>] [-G] [-T] [-R] <gff_files>...
8
9 Sort the gff\/gtf records found in input <gff_files> by genomic sequence
10 (1st column) and starting coordinate (4th column) of the main gene\/mRNA
11 features. Subfeatures (exon,CDS) are kept grouped together with their parents,
12 the sorting is applied to parent features.
13
14 Options:
15 -o the sorted output is sent to <outfile>.gff3 instead of stdout
16 -R sort the reverse strand gff records by the end (maximum)
17 coordinate of the main features and place them in a separate
18 output file; this requires the -o option to be present,
19 the two output files will be called <outfile>.f.gff3 and
20 <outfile>.r.gff3 (for forward and reverse strands separately)
21 -T rewrite the 'track' name (2nd GFF column) for each line of
22 the output with a prefix based on the input filename
23 -G ignore genomic sequence (1st GFF column)
24 /;
25 umask 0002;
26 getopts('GTRxuo:') || die($usage."\n");
27 my $outfile=$Getopt::Std::opt_o;
28 my $revsort=$Getopt::Std::opt_R;
29 die ("Error: -R option requires -o \n") if $revsort && !$outfile;
30 my $no_gsort=$Getopt::Std::opt_G;
31 my $ftrack=$Getopt::Std::opt_T;
32 my $fltunpack=$Getopt::Std::opt_u;
33 my $uflt="$0 -u"; # restore filter (unpack lines)
34 unless (-x $0) {
35 die("Error: $0 must have execute permissions!\n");
36 }
37 #---------- unpack filter mode:
38 if ($fltunpack) {
39 if ($outfile) {
40 open(OUTFILE, ">$outfile") ||
41 die ("Error (in unpack filter) creating file $outfile!\n");
42 select(OUTFILE);
43 }
44 while (<STDIN>) {
45 tr/\x01/\n/;
46 print $_;
47 }
48 if ($outfile) {
49 select(STDOUT);
50 close(OUTFILE);
51 }
52 exit;
53 # --- filter mode ends here --
54 } #unpack filter mode
55
56 die($usage."\n") unless @ARGV;
57
58 #--- packing and piping into sort ---
59 my $gs='';
60 $gs='-k1,1' unless $no_gsort;
61
62 if ($revsort) {
63 $outfile=~s/\.g[tf]f\d?$//i;
64 my $routfile=$outfile.'.r.gff3';
65 $outfile.='.f.gff3';
66 my $oe=open(ROUT, "| sort $gs -k5,5nr | $uflt -o $routfile");
67 die("Error opening sort pipe!\n") if $? || $oe==0;
68 }
69 if ($outfile) {
70 my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt -o $outfile");
71 die("Error opening sort pipe!\n") if $? || $oe==0;
72 }
73 else {
74 my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt");
75 }
76
77 my @files=@ARGV;
78 foreach my $fn (@files) {
79 die("Error: file $fn not found!\n") unless -f $fn;
80 }
81 foreach my $fn (@files) {
82 next unless -s $fn;
83 my $trackpre;
84 if ($ftrack) {
85 my $p=$fn;
86 $p=~s/\.\w+$//; #remove extension, if any
87 ($trackpre)=($p=~m/(\w+)$/); #track prefix to use
88 }
89 open(GFIN, $fn) || die("Error opening file $fn !\n");
90 #-------
91 my $p_id; #last main feature ID
92 my $pstrand; # and strand
93 my @outbuf;
94 my ($omin, $omax); #
95 while (<GFIN>) {
96 chomp;
97 my ($gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr)=split("\t",$_,9);
98 next unless $attr && ($strand eq '+' || $strand eq '-' || $strand eq '.');
99 my $lcfeat=lc($feat);
100 my $id;
101 my $gtflike=1;
102 if ($attr=~m/\bParent="?([^;"]+)/ && (index($lcfeat,'exon')<0 || $lcfeat eq 'cds')) {
103 # RNA subfeature in gff3 format (exon or CDS)
104 $id=$1;
105 }
106 elsif ($attr=~m/\bID="?([^;"]+)/) {
107 # gff3 format, possibly a parent feature
108 next if $lcfeat eq 'gene' || $lcfeat eq 'locus'; # skip extraneous high-level features in gff3
109 $id=$1;
110 $gtflike=0;
111 }
112 elsif ($attr=~m/\btranscript\w*[ =]\"?([^;"]+)/) {
113 #gtf
114 $id=$1;
115 $attr="Parent=$id";
116 }
117 else {
118 # jigsaw or other gtf-like GFF
119 ($id)=($attr=~m/^\"?([^;" ]+)/);
120 $attr="Parent=$id";
121 if ($feat=~m/exon/i) {
122 if ($track=~m/jigsaw/) { $feat='CDS' }
123 else { $feat='exon'}
124 }
125 }
126 next unless $id; # shouldn't happen!
127 $track=$trackpre if $trackpre;
128 ($start, $end)=($end, $start) if $end<$start;
129 if ($id ne $p_id) {
130 #--- ID change
131 flushgff($pstrand, \@outbuf, $omin, $omax) if $p_id;
132 $pstrand=$strand;
133 $omin=$start;$omax=$end;
134 $p_id=$id;
135 @outbuf=();
136 if ($gtflike) { #this is really true when $id ne $p_id
137 #build a parent line placeholder
138 push(@outbuf,
139 join("\t",$gn, $track, 'mRNA', "\x81","\x82",'.',$strand,'.',"ID=$id;Name=$id"));
140 }
141 else { # even if we have a parent line, we want to set the ends
142 ($start,$end)=("\x81","\x82");
143 }
144 } # ID change
145 else { # another line under the same ID
146 $omin=$start if $start<$omin;
147 $omax=$end if $end>$omax;
148 }
149 push(@outbuf,
150 join("\t",$gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr));
151 } # while reading lines
152 flushgff($pstrand, \@outbuf, $omin, $omax);
153 #-------
154 close(GFIN);
155 } # for each input gff/gtf file
156
157
158
159 ##=============== SUBROUTINES ============
160 sub flushgff {
161 my ($s, $ar, $min, $max)=@_;
162 return unless @$ar>0;
163 substr($$ar[0], index($$ar[0],"\x81"), 1, $min);
164 substr($$ar[0], index($$ar[0],"\x82"), 1, $max);
165 # same as:
166 # $$ar[0]=~s/\x81/$min/;
167 # $$ar[0]=~s/\x82/$max/;
168 if ($revsort) {
169 if ($s ne '-') {
170 print FOUT join("\x01",@$ar)."\n";
171 }
172 if ($s ne '+') {
173 print ROUT join("\x01",@$ar)."\n";
174 }
175 }
176 else {
177 print FOUT join("\x01",@$ar)."\n";
178 }
179 @$ar=();
180 }

Properties

Name Value
svn:executable *