ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gffsort
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 8 months ago) by gpertea
File size: 5489 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     gffsort [-o <outfile>] [-G] [-T] [-R] <gff_files>...
8    
9     Sort the gff\/gtf records found in input <gff_files> by genomic sequence
10     (1st column) and starting coordinate (4th column) of the main gene\/mRNA
11     features. Subfeatures (exon,CDS) are kept grouped together with their parents,
12     the sorting is applied to parent features.
13    
14     Options:
15     -o the sorted output is sent to <outfile>.gff3 instead of stdout
16     -R sort the reverse strand gff records by the end (maximum)
17     coordinate of the main features and place them in a separate
18     output file; this requires the -o option to be present,
19     the two output files will be called <outfile>.f.gff3 and
20     <outfile>.r.gff3 (for forward and reverse strands separately)
21     -T rewrite the 'track' name (2nd GFF column) for each line of
22     the output with a prefix based on the input filename
23     -G ignore genomic sequence (1st GFF column)
24     /;
25     umask 0002;
26     getopts('GTRxuo:') || die($usage."\n");
27     my $outfile=$Getopt::Std::opt_o;
28     my $revsort=$Getopt::Std::opt_R;
29     die ("Error: -R option requires -o \n") if $revsort && !$outfile;
30     my $no_gsort=$Getopt::Std::opt_G;
31     my $ftrack=$Getopt::Std::opt_T;
32     my $fltunpack=$Getopt::Std::opt_u;
33     my $uflt="$0 -u"; # restore filter (unpack lines)
34     unless (-x $0) {
35     die("Error: $0 must have execute permissions!\n");
36     }
37     #---------- unpack filter mode:
38     if ($fltunpack) {
39     if ($outfile) {
40     open(OUTFILE, ">$outfile") ||
41     die ("Error (in unpack filter) creating file $outfile!\n");
42     select(OUTFILE);
43     }
44     while (<STDIN>) {
45     tr/\x01/\n/;
46     print $_;
47     }
48     if ($outfile) {
49     select(STDOUT);
50     close(OUTFILE);
51     }
52     exit;
53     # --- filter mode ends here --
54     } #unpack filter mode
55    
56     die($usage."\n") unless @ARGV;
57    
58     #--- packing and piping into sort ---
59     my $gs='';
60     $gs='-k1,1' unless $no_gsort;
61    
62     if ($revsort) {
63     $outfile=~s/\.g[tf]f\d?$//i;
64     my $routfile=$outfile.'.r.gff3';
65     $outfile.='.f.gff3';
66     my $oe=open(ROUT, "| sort $gs -k5,5nr | $uflt -o $routfile");
67     die("Error opening sort pipe!\n") if $? || $oe==0;
68     }
69     if ($outfile) {
70     my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt -o $outfile");
71     die("Error opening sort pipe!\n") if $? || $oe==0;
72     }
73     else {
74     my $oe=open(FOUT, "| sort $gs -k4,4n | $uflt");
75     }
76    
77     my @files=@ARGV;
78     foreach my $fn (@files) {
79     die("Error: file $fn not found!\n") unless -f $fn;
80     }
81     foreach my $fn (@files) {
82     next unless -s $fn;
83     my $trackpre;
84     if ($ftrack) {
85     my $p=$fn;
86     $p=~s/\.\w+$//; #remove extension, if any
87     ($trackpre)=($p=~m/(\w+)$/); #track prefix to use
88     }
89     open(GFIN, $fn) || die("Error opening file $fn !\n");
90     #-------
91     my $p_id; #last main feature ID
92     my $pstrand; # and strand
93     my @outbuf;
94     my ($omin, $omax); #
95     while (<GFIN>) {
96     chomp;
97     my ($gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr)=split("\t",$_,9);
98     next unless $attr && ($strand eq '+' || $strand eq '-' || $strand eq '.');
99     my $lcfeat=lc($feat);
100     my $id;
101     my $gtflike=1;
102     if ($attr=~m/\bParent="?([^;"]+)/ && (index($lcfeat,'exon')<0 || $lcfeat eq 'cds')) {
103     # RNA subfeature in gff3 format (exon or CDS)
104     $id=$1;
105     }
106     elsif ($attr=~m/\bID="?([^;"]+)/) {
107     # gff3 format, possibly a parent feature
108     next if $lcfeat eq 'gene' || $lcfeat eq 'locus'; # skip extraneous high-level features in gff3
109     $id=$1;
110     $gtflike=0;
111     }
112     elsif ($attr=~m/\btranscript\w*[ =]\"?([^;"]+)/) {
113     #gtf
114     $id=$1;
115     $attr="Parent=$id";
116     }
117     else {
118     # jigsaw or other gtf-like GFF
119     ($id)=($attr=~m/^\"?([^;" ]+)/);
120     $attr="Parent=$id";
121     if ($feat=~m/exon/i) {
122     if ($track=~m/jigsaw/) { $feat='CDS' }
123     else { $feat='exon'}
124     }
125     }
126     next unless $id; # shouldn't happen!
127     $track=$trackpre if $trackpre;
128     ($start, $end)=($end, $start) if $end<$start;
129     if ($id ne $p_id) {
130     #--- ID change
131     flushgff($pstrand, \@outbuf, $omin, $omax) if $p_id;
132     $pstrand=$strand;
133     $omin=$start;$omax=$end;
134     $p_id=$id;
135     @outbuf=();
136     if ($gtflike) { #this is really true when $id ne $p_id
137     #build a parent line placeholder
138     push(@outbuf,
139     join("\t",$gn, $track, 'mRNA', "\x81","\x82",'.',$strand,'.',"ID=$id;Name=$id"));
140     }
141     else { # even if we have a parent line, we want to set the ends
142     ($start,$end)=("\x81","\x82");
143     }
144     } # ID change
145     else { # another line under the same ID
146     $omin=$start if $start<$omin;
147     $omax=$end if $end>$omax;
148     }
149     push(@outbuf,
150     join("\t",$gn, $track, $feat, $start, $end, $score, $strand, $frame, $attr));
151     } # while reading lines
152     flushgff($pstrand, \@outbuf, $omin, $omax);
153     #-------
154     close(GFIN);
155     } # for each input gff/gtf file
156    
157    
158    
159     ##=============== SUBROUTINES ============
160     sub flushgff {
161     my ($s, $ar, $min, $max)=@_;
162     return unless @$ar>0;
163     substr($$ar[0], index($$ar[0],"\x81"), 1, $min);
164     substr($$ar[0], index($$ar[0],"\x82"), 1, $max);
165     # same as:
166     # $$ar[0]=~s/\x81/$min/;
167     # $$ar[0]=~s/\x82/$max/;
168     if ($revsort) {
169     if ($s ne '-') {
170     print FOUT join("\x01",@$ar)."\n";
171     }
172     if ($s ne '+') {
173     print ROUT join("\x01",@$ar)."\n";
174     }
175     }
176     else {
177     print FOUT join("\x01",@$ar)."\n";
178     }
179     @$ar=();
180     }

Properties

Name Value
svn:executable *