ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/overlap_finder.pl
Revision: 1.1.1.1 (vendor branch)
Committed: Mon Mar 7 15:34:46 2005 UTC (11 years, 4 months ago) by knirirr
Branch: MAIN
CVS Tags: HEAD, HEAD
Changes since 1.1: +0 -0 lines
Log Message:
First import

Line File contents
1 #!/usr/local/bin/perl
2
3 use strict;
4 use Bio::SeqIO;
5 use DBI;
6
7 ################
8 # set stuff up #
9 ################
10
11 # directories for running the program and
12 # looking for genomes, respectively
13 my $dbdir = "/usr/db/data/gmine";
14 my @directories = qw {Bacteria Organelles Phage Plasmids Viroids Viruses};
15 my $outfile = "overlaps.csv";
16
17 #SQL line to get given data
18 #my $sql = "select r.genome, r.specific_taxon, r.definition, r.repeat, r.repeat_type, r.rep_start, r.rep_stop, coalesce(d.next_dist,0), coalesce(d.prev_dist,0) from repeats_bm as r, distances_bm as d where (r.genome = 'NC_002665' or r.genome = 'NC_002771' or r.genome = 'NC_002794') and r.repeat_name = d.msat_name order by r.genome, r.rep_start";
19 #my $sql = "select r.genome, r.specific_taxon, r.definition, r.repeat, r.repeat_type, r.rep_start, r.rep_stop, coalesce(d.next_dist,0), coalesce(d.prev_dist,0) from repeats_bm as r, distances_bm as d where r.repeat_name = d.msat_name order by r.genome, r.rep_start limit 20";
20 #my $sql = "select r.genome, r.specific_taxon, r.definition, r.repeat, r.repeat_type, r.rep_start, r.rep_stop, coalesce(d.next_dist,0), coalesce(d.prev_dist,0) from repeats as r, distances as d where r.repeat_name = d.msat_name order by r.genome, r.rep_start";
21 my $sql = "select r.genome, r.specific_taxon, r.definition, r.repeat, r.repeat_type, r.rep_start, r.rep_stop, coalesce(d.next_dist,0), coalesce(d.prev_dist,0) from repeats_bm as r, distances_bm as d where r.repeat_name = d.msat_name order by r.genome, r.rep_start";
22
23 # connection data
24 my $dbname = "gmine";
25 my $dbusername = "www";
26 my $dbpassword = "";
27
28 # make connection to development database
29 my $dbh = DBI->connect("dbi:Pg:host=localhost dbname=$dbname", $dbusername, $dbpassword, {RaiseError=>1, AutoCommit=>1, Taint=>1}) or die "Can't connect: $!";
30 my $sth = $dbh->prepare($sql);
31 $sth->execute();
32
33 # array of data to print out
34 my @totalout=();
35
36
37 ################
38 # process data #
39 ################
40
41 # go through list of msats
42 my %genseen = ();
43 my %genobj = ();
44 my %satmatch = ();
45 my %satlap = ();
46 my $oldgenome;
47 while (my @row = $sth->fetchrow_array())
48 {
49 # get msat info and save to new array
50 my @out=();
51 foreach my $item (@row)
52 {
53 if ($item =~ /,/)
54 {
55 my @array = split(",",$item);
56 $item = $array[0];
57 }
58 push(@out, $item);
59 }
60 my $genome = shift @row;
61 my $taxon = shift @row;
62 my $definition = shift @row;
63 my $repeat = shift @row;
64 my $rep_type = shift @row;
65 my $rep_start = shift @row;
66 my $rep_stop = shift @row;
67 my $next_dist = shift @row;
68 my $prev_dist = shift @row;
69 my $int = &frameshifter($rep_type); push (@out,$int);
70
71 # report to user
72 print "\nInvestigating genome: $genome\n" unless $genseen{$genome} == 1;
73 print "Msat at: $rep_start\n";
74
75 # open genome and get all data
76 # should change this code so there is only one
77 # access of the genome file, to save time
78 my $directory = &initcaps($taxon);
79 my ($seqio_object,$seq_object);
80 if ($genseen{$genome} == 1)
81 {
82 $seq_object = $genobj{$genome};
83 }
84 else
85 {
86 $seqio_object = Bio::SeqIO->new(-file =>"$dbdir/$directory/$genome.gbk");
87 $seq_object = $seqio_object->next_seq();
88 $genobj{$genome} = $seq_object;
89 $genobj{$oldgenome} = undef;
90 }
91 my $length = length($seq_object->seq);
92 $genseen{$genome} = 1;
93
94 # keep count of total number of hits, and
95 # a separate exact match column
96 # see $countfeat, above
97 my $exact_match = 0;
98 my @values;
99 my @feats;
100
101 #examine each feature in the genome
102 my ($oldstart,$oldstop,$caught);
103 my $percents = 0;
104 FEAT: foreach my $feat_object ($seq_object->get_SeqFeatures)
105 {
106 my $fstart=$feat_object->location->start;
107 my $fstop=$feat_object->location->end;
108
109 #ignores any feature which has the same length as the full sequence
110 unless ($fstart == 1 and $fstop == $length)
111 {
112 # look for exact matches
113 if ($rep_start >= ($fstart - $rep_type) and $rep_start <= ($fstart + $rep_type) and
114 $rep_stop >= ($fstop - $rep_type) and $rep_stop <= ($fstop + $rep_type))
115 {
116 $exact_match = 1;
117 }
118 #look for features which overlap with the msat
119 if (($rep_start < $fstart and $rep_stop >= $fstart) or
120 ($rep_start >= $fstart and $rep_start <= $fstop))
121 {
122 # get main tag
123 my $feature = $feat_object->primary_tag;
124 if ($exact_match == 1)
125 {
126 print "-- Feature found: $feature, start: $fstart - Exact match!\n";
127 }
128 else
129 {
130 print "-- Feature found: $feature, start: $fstart\n";
131 }
132 next FEAT if ($feature eq "");
133
134 # get more details
135 my $notes;
136 foreach my $tag ($feat_object->get_all_tags)
137 {
138 if ($tag eq 'note')
139 {
140 foreach my $value ($feat_object->get_tag_values($tag))
141 {
142 $value =~ s/\t//g;
143 $feature .= ":$value";
144 }
145 }
146 }
147 # find whereabouts in the feature the msat is found
148 if ($exact_match != 1 and $caught != 1)
149 {
150 my $whichway = $feat_object->strand();
151 if ($rep_start > $fstart and $rep_start < $fstop)
152 {
153 if ($whichway >= 0)
154 {
155 $percents = 100*(($rep_start - $fstart)/($fstop-$fstart));
156 }
157 elsif ($whichway <0)
158 {
159 $percents = 100*(($fstop - $rep_stop)/($fstop-$fstart));
160 }
161 }
162 elsif ($rep_start <= $fstart and $whichway > 0)
163 {
164 $percents = -100*(($rep_start - $fstart)/($fstop-$fstart));
165 }
166 }
167 if ($oldstart == $fstart and $oldstop == $fstop)
168 {
169 $satlap{$repeat}++;
170 }
171 $satmatch{$repeat}++;
172 push (@feats, $feature);
173 $oldstart = $fstart;
174 $oldstop = $fstop;
175 $caught = 1;
176 }
177 }
178 }
179 # save output
180 if (scalar @feats > 0)
181 {
182 my $matches = $satmatch{$repeat};
183 my $overlaps = $satlap{$repeat} || 0;
184 my $uniques = $matches - $overlaps;
185 push (@out,$exact_match);
186 push (@out,$matches);
187 push (@out,$overlaps);
188 push (@out,$uniques);
189 push (@out, $percents);
190 push (@out,join("|",@feats));
191 push (@totalout,\@out);
192 }
193 # delete old sequence objects to save memory
194 # - only works if sql sorts things in order
195 # of genome
196 $oldgenome = $genome;
197 }
198
199 ######################
200 # print out the data #
201 ######################
202
203 # determine whether a genome
204 # has been seen or not
205 my %seen=();
206 print "Printing out data...\n";
207 open (OUT,">$outfile") or die "Can't open $outfile: $!";
208 print OUT "genome|taxon|definition|msat|msat_type|start|stop|next_dist|last_dist|shifter|exact|feats|overlaps|uniques|from_start\n";
209 foreach my $line (@totalout)
210 {
211 my $outline = join("|", @{$line});
212 print OUT "$outline\n" unless ($seen{$outline} == 1);
213 $seen{$outline} = 1;
214 }
215 close OUT;
216
217 ############
218 # finished #
219 ############
220 $dbh->disconnect;
221 print "Huzzah!\n";
222
223 ###############
224 # subroutines #
225 ###############
226
227 # Capitalise initial letter of every word, lower-case the rest
228 sub initcaps
229 {
230 my $arg = shift;
231 $arg =~ s/([A-Za-z])([A-Za-z]+)/\U$1\E\L$2\E/g;
232 return $arg;
233 }
234
235 # return 0 if divisible by 3, 1 otherwise
236 sub frameshifter
237 {
238 my $arg = shift;
239 if (int($arg/3) == $arg/3)
240 {
241 return 0;
242 }
243 else
244 {
245 return 1;
246 }
247 }