ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/genExonMers.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 3786 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 genExonMers.pl [-k <len>] [-X] <fasta_file> <id_tmap>
8
9 Generates a multi-fasta file with all exon-contained k-mers
10 for each sequence in <fasta_file>, with header augmented with
11 data taken from <id_tmap>
12
13 Use -X option to allow probes to cross exon bounderies
14 (default is: only generate probes fully contained within exons)
15
16 Use -M option when only k-mers from multi-match genes
17 should be generated (requires <id_tmap>)
18 /;
19 umask 0002;
20 getopts('XMk:o:') || die($usage."\n");
21 my $outfile=$Getopt::Std::opt_o;
22 my $klen=$Getopt::Std::opt_k || 50;
23 my $crossexons=$Getopt::Std::opt_X;
24 my $multimatch=$Getopt::Std::opt_M;
25
26 #================ SUBROUTINES ============
27 die($usage."\n") unless @ARGV>=1;
28 my ($fa, $tmap)=@ARGV;
29 if ($tmap) {
30 foreach my $f ($fa, $tmap) {
31 die "Error: cannot find file '$f'!\n" unless -f $f;
32 die "Error: invalid file size for '$f'!\n" unless -s $f;
33 }
34 }
35
36 my %tm; # cufflinks_id => ref_id, but also ref_id => cufflinks_id
37 my %hn; # count how many times a cluster pair was found
38 #
39
40 if ($tmap) {
41 open(TMAP, $tmap) || die("Error opening file '$tmap'!\n");
42 while (<TMAP>) {
43 chomp;
44 my @t=split("\t");
45 next unless $t[2];
46 $tm{$t[0]}=$t[2];
47 $tm{$t[2]}=$t[0];
48 if ($t[8]) {
49 my $clpair=$t[7].'_'.$t[8];
50 $hn{$clpair}++;
51 $hn{$t[2]}=$clpair;
52 }
53 }
54 close(TMAP);
55 }
56 open(FA, $fa) || die("Error opening file '$fa'!\n");
57 my $c_id;
58 my @c_exons;
59 my @c_segs;
60 my $c_seq;
61 my $c_loc;
62 while (<FA>) {
63 chomp;
64 if (m/^>(\S+)/) {
65 my $id=$1;
66 genProbes();
67 if ($tmap && $multimatch) {
68 my $clpair=$hn{$id};
69 $c_id='';
70 next if $hn{$clpair}<2;
71 }
72 $c_id=$id;
73 ($c_loc)=(m/ loc:(\S+)/);
74 my ($exons)=(m/ exons:(\S+)/);
75 my ($segs)=(m/ segs:(\S+)/);
76 die ("Error parsing line ($_), exons or segs not found!\n") unless $exons && $segs;
77 @c_exons=($exons=~m/(\d+\-\d+)/g);
78 @c_exons=map { [split('-')] } @c_exons;
79 @c_segs=($segs=~m/(\d+\-\d+)/g);
80 @c_segs=map { [split('-')] } @c_segs;
81 $c_seq='';
82 next;
83 }
84 $c_seq.=uc($_) if $c_id;
85 }
86 close(FA);
87 genProbes(); # uses c_* global variables
88
89 #-----------------
90
91 sub genProbes {
92 return unless $c_id;
93 my ($chr, $chr_pos, $chr_strand)=split('\|',$c_loc);
94 my $exnum=0;
95 my $excount = @c_segs;
96 my $clen=length($c_seq);
97 die("Error at $c_id ($c_loc) - exon count mismatch!\n") if ($excount != @c_exons);
98 foreach my $seg (@c_segs) {
99 my ($start, $end)=@$seg;
100 $start--;
101 my $seglen=$end-$start;
102 $exnum++;
103 next if $seglen<$klen && !$crossexons;
104 my $r_id=$tm{$c_id} || $c_id;
105 my ($xnum, $chrpos);
106 if ($chr_strand eq '-') {
107 $xnum=$excount-$exnum;
108 my $exon=$c_exons[$xnum];
109 $chrpos=$$exon[1];
110 }
111 else {
112 $xnum=$exnum-1;
113 my $exon=$c_exons[$xnum];
114 $chrpos=$$exon[0];
115 }
116 $xnum++;
117 if ($crossexons) {
118 for (my $p=$start;$p<$end && $p+$klen<=$clen;$p++) {
119 my $xpos=($chr_strand eq '-') ? $chrpos-$klen-($p-$start)+1 : $chrpos+($p-$start);
120 #my $exn=$exnum;
121 my $exn=$xnum;
122 $exn.='_' if ($p+$klen>$end); # crosses into next exon
123 my $rstart=$p+1;
124 print ">$c_id|$r_id|$chr|$chr_strand|$excount|$exn|$xpos|$rstart\n";
125 print substr($c_seq, $p, $klen)."\n";
126 }
127 }
128 else {
129 for (my $p=$start;$p+$klen<=$end;$p++) {
130 my $xpos=($chr_strand eq '-') ? $chrpos-$klen-($p-$start)+1 : $chrpos+($p-$start);
131 my $rstart=$p+1;
132 #print ">$c_id|$r_id|$chr|$chr_strand|$excount|$exnum|$xpos|$rstart\n";
133 print ">$c_id|$r_id|$chr|$chr_strand|$excount|$xnum|$xpos|$rstart\n";
134 print substr($c_seq, $p, $klen)."\n";
135 }
136 }
137 }
138 $c_seq='';
139 @c_exons=();
140 @c_segs=();
141 $c_loc='';
142 $c_id='';
143 }

Properties

Name Value
svn:executable *