ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/poly/poly
Revision: 1.2
Committed: Thu Dec 8 12:23:44 2005 UTC (10 years, 8 months ago) by jeff
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +23 -5 lines
Log Message:
*** empty log message ***

Line File contents
1 #!/usr/bin/env python
2
3 ################################################################################
4 #
5 # Poly
6 # Version 0.1.1
7 #
8 # Copyright (c) 1996-2001 J.W. Bizzaro & University of Massachusetts Lowell
9 #
10 # Program for the quantitative analysis of simple sequence repeats (SSRs)
11 # in DNA
12 #
13 # ------------
14 #
15 # Poly is free software; you can redistribute it and/or modify it under the
16 # terms of the GNU General Public License as published by the Free Software
17 # Foundation; either version 2, or (at your option) any later version.
18 #
19 # This program is distributed in the hope that it will be useful, but WITHOUT
20 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
21 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
22 # details.
23 #
24 # You should have received a copy of the GNU General Public License along with
25 # this program; if not, write to the Free Software Foundation, Inc.,
26 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 #
28 ################################################################################
29
30 import sys, string
31 from math import log
32
33 FALSE = 0
34 TRUE = 1
35 MONOMER_LENGTH = 1
36 MIN_MONOMER_COUNT_VALUE = 1
37 MAX_MONOMER_COUNT_VALUE = 50
38 MIN_POLYMER_COUNT_VALUE = 2
39 PRINT_WHAT = 'stats'
40
41
42 # Data structure
43 #
44 # monomer_list
45 # |
46 # |------Monomer (e.g, GCA)
47 # | monomer_count_list
48 # | |
49 # | |------MonomerCount (e.g., 3 == GCAGCAGCA)
50 # | | polymer_list
51 # | | |
52 # | | |------Polymer (e.g., GCAGCAGCA found somewhere)
53 # | | | |
54 # | | | |------feature
55 # | V V V
56 # |
57 # | spacer_count_list
58 # | |
59 # | |------SpacerCount (e.g., 5 == ACGTA)
60 # | | spacer_list
61 # | | |
62 # | | |------Spacer
63 # V V V
64
65
66
67 class Monomer:
68
69 def __init__(self):
70 """
71 """
72
73 # The name of the monomer
74 self.name = ''
75 self.monomer_count_list = []
76 self.monomer_spacer_list = []
77
78 # End __init__
79
80
81 class MonomerCount:
82
83 def __init__(self):
84 """
85 """
86
87 self.value = 0
88 self.polymer_list = []
89
90 # End __init__
91
92
93 class SpacerCount:
94
95 def __init__(self):
96 """
97 """
98
99 self.value = 0
100 self.spacer_list = []
101
102 # End __init__
103
104
105 class Polymer:
106
107 def __init__(self):
108 """
109 """
110
111 # The ID of the sequence containing the polymer
112 self.seq_id = ''
113
114 # End __init__
115
116
117 class Spacer:
118
119 def __init__(self):
120 """
121 """
122
123 self.value = 0
124 self.polymer_list = []
125
126 # End __init__
127
128
129 class Poly:
130
131 def __init__(self):
132 """
133 """
134
135 global MONOMER_LENGTH, MIN_MONOMER_COUNT_VALUE, PRINT_ALL
136
137 start = 'yes'
138 self.infiles = []
139 self.infilename = ''
140
141 index = 1
142 for option in sys.argv[1:]:
143
144 if (option == '-m') | (option == '--monomer_length'):
145 MONOMER_LENGTH = int(sys.argv[index+1])
146
147 elif (option == '-i') | (option == '--infiles'):
148 self.infiles = sys.argv[(index + 1):]
149
150 elif (option == '-n') | (option == '--min_monomers'):
151 MIN_MONOMER_COUNT_VALUE = int(sys.argv[index+1])
152
153 elif (option == '-w') | (option == '--print_what'):
154 PRINT_WHAT = sys.argv[index+1]
155
156 elif (option == '-h') | (option == '--help'):
157 print 'Please see the README file.'
158 start = 'no'
159
160 elif (option == '-v') | (option == '--version'):
161 print 'Version 0.1.1'
162 start = 'no'
163
164 index = index + 1
165
166 if len(self.infiles) == 0:
167 print 'no infiles'
168 start = 'no'
169
170 if start == 'yes':
171
172 # Do an analysis for each file in the filename list
173 for self.infilename in self.infiles:
174 self.infile = open(self.infilename, 'r')
175
176 self.total_bases = 0
177 self.base_count = 0
178 self.G_count = 0
179 self.C_count = 0
180 self.A_count = 0
181 self.T_count = 0
182 self.U_count = 0
183 self.Y_count = 0
184 self.N_count = 0
185
186 self.percent = 0
187 self.mark_printed = FALSE
188 self.partial = 0
189 self.whole_monomers = 1
190 self.highestpolyvalue = 0
191
192 # Create and initialize monomer list
193 self.monomer_list = []
194 self.init_monomer_list()
195
196 self.init_window()
197
198 base = self.next_base()
199
200 #
201 # MAIN LOOP
202 #
203 while base != '':
204 self.check_base(base)
205 self.add_base_to_window(base)
206 base = self.next_base()
207
208 self.total_bases = self.base_count
209
210 self.A_freq = float(self.A_count) / float(self.total_bases)
211 self.C_freq = float(self.C_count) / float(self.total_bases)
212 self.G_freq = float(self.G_count) / float(self.total_bases)
213 self.T_freq = float(self.T_count) / float(self.total_bases)
214 self.U_freq = float(self.U_count) / float(self.total_bases)
215 self.Y_freq = float(self.Y_count) / float(self.total_bases)
216 self.N_freq = float(self.N_count) / float(self.total_bases)
217
218 # sys.stdout.write(datetostr(date), ' ', timetostr(time))
219 sys.stdout.write('Sequence file: ' + self.infilename + '\n')
220 sys.stdout.write('Total bases: ' + str(self.total_bases) + '\n')
221 sys.stdout.write('Frequency of A: ' + str(self.A_freq) + '\n')
222 sys.stdout.write('Frequency of C: ' + str(self.C_freq) + '\n')
223 sys.stdout.write('Frequency of G: ' + str(self.G_freq) + '\n')
224 sys.stdout.write('Frequency of T: ' + str(self.T_freq) + '\n')
225 sys.stdout.write('Frequency of U: ' + str(self.U_freq) + '\n')
226 sys.stdout.write('Frequency of Y: ' + str(self.Y_freq) + '\n')
227 sys.stdout.write('Frequency of N: ' + str(self.N_freq) + '\n')
228 sys.stdout.write('GC composition: ' + str((self.G_freq + self.C_freq) * 100) + '%\n')
229 sys.stdout.write('AT composition: ' + str((self.A_freq + self.T_freq) * 100) + '%\n')
230
231 self.print_results()
232 self.infile.close()
233
234
235 # End __init__
236
237 def init_monomer_list(self):
238 """
239 """
240
241 monomer_name = ''
242 base_count = 0
243
244 # Add all possible monomers to list
245 self.append_base(monomer_name, base_count)
246
247 # End init_monomer_list
248
249
250 def append_base(self, root, base_count):
251 """
252 Combine all bases to produce all possible sequences up
253 to MONOMER_LENGTH, and put them in the monomer list
254 """
255
256 base_list = ['A', 'C', 'G', 'T', 'U', 'Y', 'N']
257 base_count = base_count + 1
258
259 # Go through bases
260 for base in base_list:
261 # Append the base to monomer name (sequence)
262 monomer_name = root + base
263
264 # If we haven't finished appending bases
265 if base_count < MONOMER_LENGTH:
266 # Then append a base (recursively call this method)
267 self.append_base(monomer_name, base_count)
268 # If we have finished
269 else:
270 # Then create monomer object
271 monomer = Monomer()
272 # Name is sequence
273 monomer.name = monomer_name
274 # Initialize monomer_count list
275 monomer.monomer_count_list = self.init_monomer_count_list(monomer.monomer_count_list)
276 # Add monomer to monomer list
277 self.monomer_list.append(monomer)
278
279 # End append_base
280
281
282 def init_monomer_count_list(self, monomer_count_list):
283 """
284 """
285
286 # MAX_MONOMER_COUNT_VALUE + 1 holds all counts over MAX_MONOMER_COUNT_VALUE
287 # e.g., if monomer_count == 55 and MAX_MONOMER_COUNT_VALUE == 50, then it gets put into 51
288 for value in range(1, MAX_MONOMER_COUNT_VALUE+2):
289 monomer_count = MonomerCount()
290 monomer_count.value = value
291 monomer_count_list.append(monomer_count)
292
293 return monomer_count_list
294
295 # End init_monomer_count_list
296
297
298
299 def init_window(self):
300 """
301 Create self.window and read in the first bases to fill self.window
302 """
303
304 self.window = []
305
306 for value in range(0, MONOMER_LENGTH):
307 base = self.next_base()
308 self.window.append(base)
309
310 # End init_wiindow
311
312
313 def next_base(self):
314 """
315 Read from infile the next wanted base
316 """
317
318
319 wanted = FALSE
320 while not(wanted):
321 base = self.infile.read(1)
322 # Convert base to uppercase
323 base = string.upper(base)
324 wanted = FALSE
325 if base == 'A':
326 wanted = TRUE
327 self.A_count = self.A_count + 1
328 elif base == 'C':
329 wanted = TRUE
330 self.C_count = self.C_count + 1
331 elif base == 'G':
332 wanted = TRUE
333 self.G_count = self.G_count + 1
334 elif base == 'T':
335 wanted = TRUE
336 self.T_count = self.T_count + 1
337 elif base == 'U':
338 wanted = TRUE
339 self.U_count = self.U_count + 1
340 elif base == 'Y':
341 wanted = TRUE
342 self.Y_count = self.Y_count + 1
343 elif base == 'N':
344 wanted = TRUE
345 self.N_count = self.N_count + 1
346 elif base == '<':
347 self.read_tag()
348 wanted = TRUE
349 elif base == '':
350 wanted = TRUE
351
352 # Some characters are wanted but not counted as bases
353 if not((base == '') | (base == '*')):
354 self.base_count = self.base_count + 1
355
356 # Temp
357 if base == '*': base = 'M'
358
359 return base
360
361 # End next_base
362
363
364
365 def read_tag(self):
366 """
367 """
368
369 parameter_count_value = 0
370 parameter, remaining = self.read_parameter()
371
372 while (parameter != '') & remaining:
373 parameter_count_value = parameter_count_value + 1
374 parameter, remaining = self.read_parameter()
375
376 ########
377 ########
378 ########
379
380 # End read_tag
381
382
383 def read_parameter(self):
384 """
385 """
386
387 parameter = ''
388 character = self.infile.read(1)
389
390 while character == ' ':
391 character = self.infile.read(1)
392
393 while (character != ' ') & (character != '>') & (character != ''):
394 # Convert character to lowercase
395 character = string.lower(character)
396 parameter = parameter + character
397 character = self.infile.read(1)
398
399 if character == ' ':
400 remaining = TRUE
401 else:
402 remaining = FALSE
403
404 return parameter, remaining
405
406 # End read_parameter
407
408
409
410 def check_base(self, base):
411 """
412 """
413
414 # If next base matches the first base in window,
415 # then there might be another monomer
416 if base == self.window[0]:
417 # We have at least a partial monomer
418 self.partial = self.partial + 1
419
420 # If we've reached the monomer length, then we have
421 # a whole monomer
422 if self.partial == MONOMER_LENGTH:
423 self.whole_monomers = self.whole_monomers + 1
424 self.partial = 0
425
426 # If next base does NOT match the first base in window
427 # then the polymer is complete, and it can be added to the list
428 else:
429 is_nested_polymer = FALSE
430
431 # Homopolymers can't be nested polymers
432 if MONOMER_LENGTH > 1:
433 # Check to see if the monomer contains its own polymer
434 is_nested_polymer = self.is_nested_polymer()
435
436 # If the polymer is not a nested polymer
437 if not(is_nested_polymer):
438 # Then the polymer can be created
439 monomer_name = self.list_to_string(self.window)
440 polymer = self.make_and_add_polymer(monomer_name, \
441 self.whole_monomers)
442
443 self.partial = 0
444 self.whole_monomers = 1
445
446 # End check_base
447
448
449
450 def add_base_to_window(self, base):
451 """
452 """
453
454 # Shift bases in self.window one place to the left
455 for value in range(0, len(self.window)-1):
456 self.window[value] = self.window[value+1]
457
458 self.window[len(self.window)-1] = base
459
460 # End add_base_to_window
461
462
463
464 def is_nested_polymer(self):
465
466 """
467 Routine to check for nested polymers (nps). This is when the
468 monomer being examined is actually a homopolymer or a polymer
469 of a smaller monomer size.
470
471 Returns TRUE or FALSE
472 """
473
474 is_nested_polymer = FALSE
475 startofpattern = FALSE
476 restofpattern = FALSE
477
478 for tab in range(0, MONOMER_LENGTH):
479 if (is_nested_polymer == FALSE) & ((MONOMER_LENGTH % tab) == 0):
480 startofpattern = TRUE
481
482 for tabjump in range(0, (MONOMER_LENGTH / tab)):
483 if self.window[0] <> self.window[0 + (tab * tabjump)]:
484 startofpattern = FALSE
485
486 if (startofpattern == TRUE) & (tab == 0):
487 is_nested_polymer = TRUE
488
489 if (startofpattern == TRUE) & (tab > 0):
490 restofpattern = TRUE
491
492 for index in range(1, tab+1):
493 for tabjump in range(1, (MONOMER_LENGTH / tab)):
494 if self.window[index] <> self.window[index + \
495 (tab * tabjump)]:
496 restofpattern = FALSE
497
498 if restofpattern == TRUE:
499 is_nested_polymer = TRUE
500
501 return is_nested_polymer
502
503 # End check_nested_polymer
504
505
506
507 def make_and_add_polymer(self, monomer_name, monomer_count_value):
508 """
509 Returns either polymer object or None
510 """
511
512 # Make polymer
513 polymer = Polymer()
514 # The ID of the sequence containing the polymer
515 polymer.seq_id = 'my_id'
516
517 # Search the list of monomers
518 for monomer in self.monomer_list:
519 # If monomer names match
520 if monomer.name == monomer_name:
521 # Then search the list of monomer_counts
522 for monomer_count in monomer.monomer_count_list:
523 # If monomer_counts match
524 if monomer_count.value == monomer_count_value:
525 # Then add the polymer to the monomer_count list
526 monomer_count.polymer_list.append(polymer)
527
528 return polymer
529
530 # End make_polymer
531
532
533 def list_to_string(self, list):
534 """
535 Convert a list into a single string
536 """
537
538 string = ''
539 for character in list:
540 string = string + character
541
542 return string
543
544 # End list_to_string
545
546
547
548 def print_results(self):
549 """
550 """
551
552 # For every monomer in the monomer list
553 for monomer in self.monomer_list:
554
555 # Open a new outfile
556 self.open_outfile(monomer.name)
557 # ???Do at some other point???
558 self.zero_three_dep = FALSE
559 self.zero_five_dep = FALSE
560 # Get the predicted frequency
561 monomer.freq_pred = self.get_freq_monomer_pred(monomer.name)
562 # Print the predicted frequency
563 self.print_extinction(monomer.name, monomer.freq_pred)
564 # Print results
565
566 if PRINT_WHAT == 'stats':
567 # Search the list
568 for monomer_count in monomer.monomer_count_list:
569 if monomer_count.value >= MIN_MONOMER_COUNT_VALUE:
570 polymer_count_value = 0
571 for polymer in monomer_count.polymer_list:
572 polymer_count_value = polymer_count_value + 1
573
574 # Print out the stats for polymers in list
575 self.print_stats(monomer.name, monomer_count.value, \
576 polymer_count_value, monomer.freq_pred)
577
578 elif PRINT_WHAT == 'polymers':
579 # Search the list
580 for monomer_count in monomer.monomer_count_list:
581 if monomer_count.value >= MIN_MONOMER_COUNT_VALUE:
582 polymer_count_value = 0
583 for polymer in monomer_count.polymer_list:
584 polymer_count_value = polymer_count_value + 1
585
586 self.print_a_polymer(monomer_count.value, \
587 polymer_count_value, polymer.seq_id)
588
589 elif PRINT_WHAT == 'spacers':
590 # Search the list
591 for spacer in monomer.spacer_list:
592
593
594
595
596
597 self.close_outfile()
598
599 # End print_results
600
601
602 def print_extinction(self, monomer_name, freq_pred):
603 """
604 """
605
606 extinction = self.get_extinction(freq_pred)
607 if extinction != -1:
608 sys.stdout.write('Extinction for ' \
609 + str(monomer_name) + ': ' \
610 + str(extinction) + '\n')
611 else:
612 sys.stdout.write('Div by 0 calculating extinction for ' \
613 + str(monomer_name) + '\n')
614
615 # End print_extinction
616
617
618 def open_outfile(self, monomer_name):
619 """
620 """
621
622 # Open a new outfile
623 outfilename = self.infilename + '.' + PRINT_WHAT + '.monomer=' \
624 + monomer_name
625 self.outfile = open(outfilename, 'w')
626
627 # End initialize outfile
628
629
630
631
632 def print_a_polymer(self, monomer_count_value, polymer_count_value, \
633 polymer_seq_id):
634 """
635 """
636
637 self.outfile.write(str(monomer_count_value) + chr(9) \
638 + str(polymer_count_value) + chr(9) \
639 + polymer_seq_id + '\n')
640
641 # End print_polymer
642
643
644 def get_freq_monomer_pred(self, monomer_name):
645 """
646 """
647
648 # Find the predicted frequency of the monomer's occurance.
649 # This is found by multiplying the individual OBSERVED base frequencies.
650 # This value is used to determine the departure of frequencies
651 # from predicted.
652 freq_monomer_pred = 1.0
653 for base in monomer_name:
654 if base == 'A':
655 freq_monomer_pred = freq_monomer_pred * self.A_freq
656 elif base == 'C':
657 freq_monomer_pred = freq_monomer_pred * self.C_freq
658 elif base == 'G':
659 freq_monomer_pred = freq_monomer_pred * self.G_freq
660 elif base == 'T':
661 freq_monomer_pred = freq_monomer_pred * self.T_freq
662 elif base == 'U':
663 freq_monomer_pred = freq_monomer_pred * self.U_freq
664 elif base == 'Y':
665 freq_monomer_pred = freq_monomer_pred * self.Y_freq
666 elif base == 'N':
667 freq_monomer_pred = freq_monomer_pred * self.N_freq
668
669 return freq_monomer_pred
670
671 # End get_freq_monomer_pred
672
673 def close_outfile(self):
674 """
675 """
676
677 self.outfile.close()
678
679 # End close_outfile
680
681
682 def get_extinction(self, freq_monomer_pred):
683 """
684 """
685
686 # extinction : log (1 / total_bases) / log (freq_monomer_pred)
687
688 # FIX!!! extinction : log (1 / (total_bases * (1 - freq_monomer_pred)^2) / log (freq_monomer_pred)
689
690 if log(freq_monomer_pred) != 0:
691 extinction = int(log(1 / float(self.total_bases)) \
692 / log(freq_monomer_pred))
693
694 else:
695 extinction = -1
696
697 return extinction
698
699 # End get_extinction
700
701
702 def print_stats(self, monomer_name, monomer_count_value, polymer_count_value, \
703 freq_monomer_pred):
704 """
705 """
706
707 # freq_polymer_obs : polymer_count_value / total_bases
708 # log_freq_polymer_obs : log(freq_polymer_obs)
709 # freq_monomer_pred : freq_base1_obs * freq_base2_obs * ...
710 # This is set when the polymer object is created
711 # freq_polymer_pred :
712 # freq_monomer_pred^monomer_count_value * (1 - freq_monomer_pred)^2
713 # ^^^^^^^^^^^^^
714 # A whole polymer is surrounded by non-monomers
715 #
716 # depart_from_pred : freq_polymer_obs / freq_polymer_pred
717 # log_depart_from_pred : log(depart_from_pred)
718
719 freq_polymer_obs = float(polymer_count_value) / float(self.total_bases)
720 # Note that log() in Python is the natural log or ln()
721 log_freq_polymer_obs = log(freq_polymer_obs) / log(10)
722
723 freq_polymer_pred = freq_monomer_pred ** monomer_count_value \
724 * (1 - freq_monomer_pred ) ** 2
725
726 if freq_polymer_pred != 0:
727 depart_from_pred = freq_polymer_obs / freq_polymer_pred
728 log_depart_from_pred = log(depart_from_pred) / log(10)
729
730 # Print details of polymer if polymer appears
731 # the desired value of times
732 if polymer_count_value >= MIN_polymer_count_value:
733 self.outfile.write(str(monomer_count_value) + chr(9) \
734 + str(polymer_count_value) + chr(9) \
735 + str(log_freq_polymer_obs) + chr(9) \
736 + str(log_depart_from_pred) + '\n')
737
738 if (log_depart_from_pred >= 0.3) & (self.zero_three_dep != TRUE):
739 sys.stdout.write('0.3 departure for ' \
740 + monomer_name + ': ' \
741 + str(monomer_count_value) + '\n')
742 self.zero_three_dep = TRUE
743
744 if (log_depart_from_pred >= 0.5) & (self.zero_five_dep != TRUE):
745 sys.stdout.write('0.5 departure for ' \
746 + monomer_name + ': ' \
747 + str(monomer_count_value) + '\n')
748 self.zero_five_dep = TRUE
749
750 else:
751 sys.stdout.write('Div by 0 calculating departure for ' + \
752 monomer_name + ' ' + str(monomer_count_value) + '\n')
753
754 # End print_stats
755
756
757 if __name__ == '__main__':
758 Poly()