ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/mirfind_excise.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 6265 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = qq/
6 mirfind_excise.pl [-o <out_prefix>] <chromosomes.fa.cidx> <bowtie_sorted.map>
7
8 Excises potential microRNA precursor sequences from a genome
9 using the positions of aligned reads as guidelines.
10
11 <chromosomes.fa.cidx> must be the cdbyank index for the multifasta file
12 with all chromosomes.
13 The input file <bowtie_sorted.map> MUST BE SORTED
14 by chromosome AND then start position.
15
16 Two output files will be created:
17 1) <out_prefix>.xc.fa : fasta files with excised regions
18 2) <out_prefix>.xc.xmap : a pseudo-fasta file (suitable for indexing)
19 holding read mappings on to every excised region
20 (with coordinates relative to the excised sequence)
21
22 If -o option was not given, <out_prefix> will be set to the input file name.
23 /;
24
25 # -- Note: in miRDeep's script the coordinates in the resulting fasta file
26 # are relative to the reverse complemented strand for '-' alignments
27 # --
28 # this is NOT the way they are written here: coordinates of the excised
29 # ranges are always relative to the forward strand, like in a GFF file
30
31
32 getopts('o:') || die "$usage\n";
33 my $fgenomecidx=shift(@ARGV) or die $usage;
34 my $file_map=shift(@ARGV) or die $usage;
35 die($usage."\n") unless $fgenomecidx=~m/\.cidx$/ && -f $fgenomecidx;
36 my $fgenome=$fgenomecidx;
37 $fgenome=~s/\.cidx$//;
38 die($usage."Error: cannot locate fasta file $fgenome\n") unless -f $fgenome;
39 die($usage."Error: cannot locate mapping file $file_map\n") unless -f $file_map;
40
41 my $outprefix=$Getopt::Std::opt_o;
42 unless ($outprefix) {
43 $outprefix=$file_map;
44 $outprefix=~s/\.(\w+)$//;
45 }
46 my ($outfa, $outmap)=($outprefix.'.xc.fa', $outprefix.'.xc.xmap');
47 open(OUTFA, '>'.$outfa) || die("Error creating file $outfa!\n");
48 select(OUTFA);
49 open(OUTMAP, '>'.$outmap) || die("Error creating file $outmap!\n");
50
51 my ($gseqname, $gseq, $gseqlen, $gseqcounter);
52 excise_by_map($file_map);
53 select(STDOUT);
54 close(OUTFA);
55 close(OUTMAP);
56 exit;
57 # -------------------
58
59 #sub parse_file_blast_parsed{
60
61 sub excise_by_map { #processes bowtie map output
62 my($file)=@_;
63 open(FILENAME, $file) or die "Could not open file $file";
64
65 my (@r_start, @r_end);
66 #current region start and end coordinates of last region - by strand (0='+', 1='-')
67 my @xmap=([],[]); #read mappings data on the selected region, by strand (0='+', 1='-')
68 while (<FILENAME>) {
69 chomp;
70 my @t=split(/\t/);
71 next unless (@t>4) && ($t[3]>0);
72 my $query=$t[0];
73 my $strand=$t[1];
74 my $qlen=length($t[4]);
75 my $chr=$t[2];
76 my $sidx=($strand eq '-')? 1 : 0;
77 my $g_start=$t[3]+1;
78 my $g_end=$g_start+$qlen-1;
79 my $newgseq=($chr ne $gseqname);
80 if ($newgseq) {
81 foreach my $i (0..1) {
82 if ($r_end[$i]) {
83 my $rs=($i==0)?'+':'-';
84 gExcise($rs, $r_start[$i], $r_end[$i], $xmap[$i]); # assumes $gseqname was set properly
85 ($r_start[$i], $r_end[$i])=(0,0);
86 }
87 }
88 @xmap=([],[]);
89 loadGSeq($chr); #this will set ($gseqname, $gseq, $gseqlen)
90 }
91 #elsif ($r_end[$sidx] && $g_start-$r_end[$sidx]>30) {
92 elsif ($r_end[$sidx] && $g_start-$r_end[$sidx]>22) {
93 # same genomic sequence, but separated by a gap
94 gExcise($strand, $r_start[$sidx], $r_end[$sidx], $xmap[$sidx]);
95 ($r_start[$sidx], $r_end[$sidx])=(0,0);
96 $xmap[$sidx]=[];
97 }
98 push(@{$xmap[$sidx]}, [$query, $g_start, $g_end, $t[7], $t[8]]);
99 $r_start[$sidx]=$g_start if ($r_start[$sidx]==0 || $r_start[$sidx]>$g_start);
100 $r_end[$sidx]=$g_end if $r_end[$sidx]<$g_end;
101 } # for each bowtie mapping
102
103 #process last regions:
104 foreach my $i (0..1) {
105 if ($r_end[$i]) {
106 my $rs=($i==0)?'+':'-';
107 gExcise($rs, $r_start[$i], $r_end[$i], $xmap[$i]); # assumes $gseqname was set properly
108 ($r_start[$i], $r_end[$i])=(0,0);
109 $xmap[$i]=[];
110 }
111 }
112 close(FILENAME);
113 }
114
115 sub loadGSeq {
116 my ($subject) = @_;
117 return if $gseqname eq $subject && $gseqlen>10;
118 print STDERR "processing $subject.. \n";
119 # load genomic sequence in $gseq and update ($gseq, $gseqname, $gseqlen)
120 my $fofs=`cdbyank -a '$subject' -P $fgenomecidx`;
121 chomp($fofs);
122 die("Error: cannot cdbyank $subject from $fgenomecidx!\n") unless length($fofs)>0;
123 open(FASTA, $fgenome) or die "Error opening $fgenome\n";
124 binmode(FASTA);
125 seek(FASTA, $fofs, 0);
126 my $l=<FASTA>;
127 if ($l=~m/^>(\S+)/) {
128 my $new=$1;
129 die("Error: $new sequence name found, expected $subject\n") unless $new eq $subject;
130 $gseqname=$new;
131 }
132 else {
133 die("Error: no FASTA header in $fgenome at offset $fofs (qseq=$subject)\n");
134 }
135 #now read the sequence
136 $gseqlen=0;
137 $gseq='';
138 while (<FASTA>) {
139 last if m/^>/;
140 chomp;
141 $gseq.=$_;
142 $gseqlen+=length($_);
143 }
144 close(FASTA);
145 $gseqcounter=0;
146 }
147
148
149 sub gExcise{
150 my ($gstrand, $gstart, $gend, $xmaps)=@_;
151 #print STDERR "gexcise: $gstart..$gend\n";
152 return unless $gend>$gstart;
153 if ($gend-$gstart>60) {
154 #longer alignments should be excised as a single potential precursor, shorter as two
155 my $ext=136-($gend-$gstart);
156 if ($ext<0) { $ext=0; }
157 else { $ext>>=1; }
158 #print STDERR "gexcise ext=$ext\n";
159 writeExcision(\$gseqcounter, $gstrand, $gstart-$ext, $gend+$ext, $xmaps);
160 }
161 else { #short region (<=30)
162 writeExcision(\$gseqcounter, $gstrand, $gstart-22, $gend+84, $xmaps);
163 writeExcision(\$gseqcounter, $gstrand, $gstart-84, $gend+22, $xmaps);
164 }
165 }
166
167
168 sub writeExcision {
169 my ($k, $strand, $start, $end, $xm)=@_;
170 $start=1 if $start<1;
171 $end=$gseqlen if $end>$gseqlen;
172 my $len=$end-$start+1;
173 my $subseq=uc(substr($gseq, $start-1, $len));
174 $subseq=revcom($subseq) if $strand eq '-';
175 print ">$gseqname\_$$k $strand|$start-$end\n$subseq\n";
176 print OUTMAP ">$gseqname\_$$k $strand|$start-$end\n";
177 foreach my $m (@$xm) {
178 my ($mstart, $mend)=($$m[1]-$start+1, $$m[2]-$start+1);
179 # reverse complement coordinates if needed:
180 ($mstart, $mend)=($len-$mend+1, $len-$mstart+1) if ($strand eq '-');
181 my @wm=@$m;
182 @wm[1..2]=($mstart, $mend);
183 print OUTMAP join("\t",@wm)."\n";
184 }
185 $$k++;
186 }
187
188 sub revcom{
189 my $seq=reverse($_[0]);
190 $seq=~tr/acgtuACGTU/TGCAATGCAA/;
191 return $seq;
192 }

Properties

Name Value
svn:executable *