ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff_get_introns.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 4 months ago) by gpertea
File size: 5039 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     gff_get_introns.pl [-m <minlen>] <gff_data_stream..>
8     Extract intron coordinates from given GFF data stream.
9     Output this tabulated fomat:
10    
11     <chr><strand> <start> <end> <intron_id> <transcript_info>
12    
13     Options:
14     -m minium intron length to extract (default 100000)
15     /;
16     umask 0002;
17     getopts('m:o:') || die($usage."\n");
18     my $minlen=$Getopt::Std::opt_m || 100000;
19     my $outfile=$Getopt::Std::opt_o;
20     if ($outfile) {
21     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
22     select(OUTF);
23     }
24     # --------------------- 0 1 2 3 4 5 6 7
25     my %recs; # recID => [ chr.strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
26     #
27     my $curtag; #chromosome and strand for current model
28     my ($gname, $tdescr);
29     my @exd; #exons for current model
30     while (<>) {
31     next if m/^\s*#/;
32     chomp;
33     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
34     next unless $fstart>1 && $lnum;
35     next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
36     my $gff3_ID;
37     my $gff3_Parent;
38     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
39     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
40     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
41     if ($gff3_ID || $gff3_Parent) { # GFF format
42     $gff3_ID=~tr/"//d; #"
43     $gff3_Parent=~tr/"//d; #"
44     $gff3_Parent='' if ($f eq 'mRNA');
45     if ($gff3_ID && !$gff3_Parent) { #top level feature
46     if ($f=~m/RNA/i || $f=~/gene/) {
47     # try to parse the description, if any
48     $tdescr='';
49     $gname='';
50     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
51     $tdescr=$1;
52     }
53     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
54     $tdescr=$1;
55     }
56     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
57     $gname=$1;
58     }
59     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
60     $gname=$1;
61     }
62     $tdescr='' if ($tdescr eq $gname);
63     $gname='' if $gname eq $gff3_ID;
64     }
65     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
66     my $recID="$chr|$gff3_ID";
67     $curtag=$chr.$strand;
68     $recs{$recID} = [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
69     next;
70     } # parent/top-level feature
71     } #GFF
72     else { #GTF format
73     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
74     }
75     # -------------- exon/CDS line here:
76     my $recID;
77     if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
78     $recID=$chr.'.jsm.'.$lnum;
79     }
80     elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
81     $recID=$1;
82     $recID=~tr/"//d; #"
83     }
84     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
85     $recID=$1;
86     $recID=~tr/"//d; #"
87     }
88     else {
89     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
90     }
91     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
92     $gname=$1;
93     $gname=~tr/"//d; #"
94     }
95     $tdescr='' if index($recID, $tdescr)>=0;
96     $gname='' if index($recID, $gname)>=0;
97     $curtag=$chr.$strand;
98     $recID=$chr.'|'.$recID;
99     my $ld = $recs{$recID};
100     if ($ld) { #existing entry
101     my $i=($f eq 'CDS') ? 7 : 6;
102     my ($lstart, $lend)=($$ld[4], $$ld[5]);
103     $$ld[4]=$fstart if $fstart<$lstart;
104     $$ld[5]=$fend if $fend>$lend;
105     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
106     }
107     else { # first time seeing this locus/gene
108     $recs{$recID} = ($f eq 'CDS') ?
109     [$curtag, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
110     [$curtag, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
111     }
112     } #while <>
113    
114     writeModels();
115    
116     # --
117     if ($outfile) {
118     select(STDOUT);
119     close(OUTF);
120     }
121    
122     #************ Subroutines **************
123     sub writeModels {
124     return if keys(%recs)==0;
125     while ( my ($l, $ld)=each(%recs) ) {
126     # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
127     my ($chrstrand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
128     my ($mstart,$mend)=($lstart, $lend);
129     my @ex;
130     if (@$er<1 && @$cr>0) {
131     @ex = sort { $main::a->[0] <=> $main::b->[0] } @$cr;
132     }
133     else {
134     @ex = sort { $main::a->[0] <=> $main::b->[0] } @$er;
135     }
136     # -- now look at the introns
137     next if @ex<2;
138     my @in; #introns to be printed stored here
139     #my $icount=0; #intron output counter
140     ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
141     for (my $i=1;$i<@ex;$i++) {
142     my ($istart, $iend)=($ex[$i-1]->[1]+1,$ex[$i]->[0]-1);
143     next unless ($iend-$istart+1>=$minlen);
144     push(@in, [$istart, $iend]);
145     #$icount++;
146     my $ann="";
147     $ann=" gene:$gname" if $gname;
148     $ann.=" product:$descr" if $descr;
149     print join("\t", $chrstrand, $istart, $iend, $l.".i$i", $ann)."\n";
150     }
151     } #for each stored transcript
152     }

Properties

Name Value
svn:executable *