ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/seqdoc/seqdoc.pl
Revision: 1.2
Committed: Fri Apr 8 04:16:59 2005 UTC (11 years ago) by mcrowe
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +4 -4 lines
Log Message:
08/04/2005 - Initial alignment calculation modified to test 1000 datapoints, not 500. Some examples where noisy initial sequence was preventing good alignments.

Line File contents
1 #!/usr/bin/perl
2
3 =head1 NAME
4
5 seqdoc.pl - Perl cgi script to align and subtractively compare two sequence
6 chromatograms in ABI (Applied Biosystems) format.
7
8 =head1 SYNOPSIS
9
10 seqdoc.pl?ref_seq=<reference sequence file>&test_seq=<test_sequence file>
11
12 Designed to be called as a cgi script from a web form with the two filenames
13 supplied as parameters.
14
15 =head1 DESCRIPTION
16
17 This script does the following in order:
18
19 =over 3
20
21 =item *
22
23 Reads in the two sequence files
24
25 =item *
26
27 Normalizes the peak heights for each trace
28
29 =item *
30
31 Aligns the two traces
32
33 =item *
34
35 Calculates a difference profile between the traces
36
37 =item *
38
39 Processes the difference profile to highlight differences characteristic of
40 single bases
41
42 =item *
43
44 Outputs aligned images of the two input sequences and the difference profile
45 in html format
46
47 =back
48
49 All processing is performed automatically, with no use-rdefined parameters other
50 than the sequence file names. Settings can be modified by altering values
51 defined in the code if desired.
52
53 The program requires write access to a temporary directory (by default /usr/local/apache/htdocs/Temp/) to store the image files between their generation and display on the web page.
54
55 =head1 WEBSITE
56
57 http://research.imb.uq.edu.au/seqdoc/
58
59 =head1 AUTHOR
60
61 Written by Mark Crowe - m.crowe@imb.uq.edu.au
62
63 The Institute for Molecular Bioscience, The University of Queensland, Brisbane, Queensland, Australia.
64
65 =head1 COPYRIGHT
66
67 Copyright 2005, Mark Crowe.
68
69 This program is free software. You may copy or redistribute it under the terms of the GNU General Public Licence.
70
71 =cut
72
73
74
75
76 use strict;
77 #use warnings;
78 use CGI;
79 use ABI;
80 use GD::Graph::lines;
81
82 my $cgi = new CGI;
83
84 # Create a list of the four channels to save on having to keep retyping them
85 # in loops
86 our @trace_list = qw( a_trace g_trace c_trace t_trace );
87 our $temp_dir = '/usr/local/apache/htdocs/Temp/';
88
89 # Get input data parameters from the web page
90 my $ref_seq = $cgi->param('ref_seq');
91 my $test_seq = $cgi->param('test_seq');
92
93 # Create temporary files to store the uploaded chromatograms
94 ### Might need to modify to generate 'unique' names each time ###
95 # Reference sequence
96 open REF, ">".$temp_dir.$$."ref_seq";
97 my $ref_fh = $cgi->upload('ref_seq');
98 print REF while <$ref_fh>;
99 # Need to close files, otherwise ABI.pm can't access them properly
100 close REF;
101 # And test sequence
102 open TEST, ">".$temp_dir.$$."test_seq";
103 my $test_fh = $cgi->upload('test_seq');
104 print TEST while <$test_fh>;
105 # Need to close files, otherwise ABI.pm can't access them properly
106 close TEST;
107 # Mac uploads put in extra header info. Delete it, since all it seems to do
108 # is break things
109 demac();
110
111 # Extract the data from the chromatogram files. Get it as a hash reference
112 my $ref_data = read_chromatogram($temp_dir.$$.'ref_seq');
113 my $test_data = read_chromatogram($temp_dir.$$.'test_seq');
114 # Extract the sequence names
115 my $ref_name = $ref_data->{name};
116 my $test_name = $test_data->{name};
117
118 # Normalize peak heights to allow valid comparisons between runs
119 # By passing a hash reference, the changes made in the subroutine will act on
120 # the original hash (which is what we want in this case)
121 normalize($ref_data);
122 normalize($test_data);
123
124 # Now align test trace to reference trace. Reference trace doesn't change.
125 get_best_align($ref_data, $test_data);
126
127 # Calculate differences between traces
128 my ($align_length, $diffs) = differences($ref_data, $test_data);
129
130 # Finally generate output images - need to create temporary files, as can't
131 # include image data directly in html
132 # Pass details about image size and scale to the subroutine, also output file
133 my $ref_image = get_image($ref_data, 0, 2000, 200, $temp_dir.$$."ref_image.png", $align_length, 1);
134 my $test_image = get_image($test_data, 0, 2000, 200, $temp_dir.$$."test_image.png", $align_length, 1 );
135 my $diff_image = get_image($diffs, -500, 500, 300, $temp_dir.$$."diff_image.png", $align_length, 0);
136
137
138 # Print out page
139 print $cgi->header;
140 print "<head><title>Chromatogram comparison</title></head>";
141 print "<body>";
142 print "Reference sequence";
143 if ($ref_data->{name}) {print " - ".$ref_data->{name}."<br>"}
144 else {print " - ".$ref_seq."<br>"}
145 print "<img src=\"/Temp/".$$."ref_image.png\"><br>\n";
146 print "<img src=\"/Temp/".$$."diff_image.png\"><br>\n";
147 print "Test sequence";
148 if ($test_data->{name}) {print " - ".$test_data->{name}."<br>"}
149 else {print " - ".$test_seq."<br>"}
150 print "<img src=\"/Temp/".$$."test_image.png\"><br>\n";
151 print "</body></html>";
152
153 # Leave output files - they are deleted automatically each night
154
155
156 sub error_end {
157 # Deals with errors in input file format
158 my $error = shift;
159 my %error_reply = (abi_format => 'One of the files does not seem to be a correctly formatted ABI file.',
160 trace_error => 'Unable to extract trace data from one of the chromatograms.',
161 basecalls => 'Unable to extract basecall positions from one of the input files.',
162 sequence => 'Unable to extract called sequence from one of the input files.',
163 short_seq => 'One of the sequences is too short for a valid comparison to be made',
164 );
165 my $reply = $error_reply{$error};
166 # Print out page
167 print $cgi->header;
168 print "<head><title>Chromatogram comparison error</title></head>";
169 print "<body>";
170 print "<h1>Error report</h1>\nAn error has occurred. The error is :- <br><pre>$reply</pre>\nPlease check that both your input files are valid ABI chromatograms.";
171 print "</body></html>";
172 exit;
173 }
174
175
176
177 sub read_chromatogram {
178 # Reads in data from the chromatogram files and returns it as a hash reference
179 # Provide filename as the argument to the function call
180 my $file = shift;
181 # Create a new ABI object pointing to that file
182 my $abi = ABI->new(-file=>$file) or error_end('abi_format');
183 # Get the sample name from the file
184 my $sample_name = $abi->get_sample_name();
185 #Now get the four traces
186 my @trace_a = $abi->get_trace("A") or error_end('trace_error');
187 my @trace_g = $abi->get_trace("G") or error_end('trace_error');
188 my @trace_c = $abi->get_trace("C") or error_end('trace_error');
189 my @trace_t = $abi->get_trace("T") or error_end('trace_error');
190 # Remove blank sequence from end of trace
191 # Blank sequence often has noise, so can't use equal to zero. Instead have
192 # threshold cutoff. Tried with various values, even as high as 500 seems to
193 # work and not delete real sequence, while as low as 5 still removes most
194 # blank sequence. 50 therefore seems to be a good compromise.
195 while ($trace_a[-1] < 50 && $trace_g[-1] < 50 && $trace_c[-1] < 50 && $trace_t[-1] < 50) {
196 pop(@trace_a); pop(@trace_g); pop(@trace_c); pop(@trace_t)}
197
198 my @base_pos = $abi->get_base_calls() or error_end('basecalls');
199 my $sequence = $abi->get_sequence() or error_end('sequence');
200 # Length is number of datapoints in the sequence
201 my $seq_length = $#trace_a;
202 # Put all the data together into a hash to return it
203 my %return_hash = (name => $sample_name,
204 a_trace => \@trace_a,
205 g_trace => \@trace_g,
206 c_trace => \@trace_c,
207 t_trace => \@trace_t,
208 seq_length => $seq_length,
209 sequence => $sequence,
210 base_pos => \@base_pos,
211 );
212 return \%return_hash;
213 }
214
215 sub normalize {
216 # Takes the raw data from the chromatograms and normalizes each value to a
217 # mean of 100 with respect to all values within 500 datapoints either side
218 # N.B. There is a special case for points within 500bp of the start.
219
220 my $trace_data = shift;
221 # Get the length of good trace data
222 my $trace_length = $trace_data->{seq_length};
223 # Ensure that the sequence is long enough for normalization to not give any
224 # errors. A shorter sequence probably won't be a lot of use anyway. Use
225 # 1100 as first 500 + last 500 plus a short middle buffer. Actual value has
226 # no meaning beyond preventing errors
227 error_end('short_seq') unless $trace_length > 1100;
228
229 # Need to create new reference arrays, since normalization alters the existing
230 # values in the hash
231 my %orig_values;
232 for (@trace_list) {
233 for my $index (0..$trace_length) {
234 $orig_values{$_}[$index] = $trace_data->{$_}[$index];
235 }
236 }
237
238 # Do the middle of the trace first - from #500 to 500 before the end
239 my $totalsum = 0;
240 # Set the normalization sum before starting
241 for my $index (0..999) {
242 for (@trace_list) {
243 $totalsum += $orig_values{$_}[$index];
244 }
245 }
246 for my $datapoint (500..($trace_length - 501)) {
247 # Normalize to 100. Divide by sum of all values, multiply by number of
248 # values, and multiply by 100;
249 # First add values for datapoint +500
250 for (@trace_list) {
251 $totalsum += $orig_values{$_}[$datapoint + 500];
252 }
253 for (@trace_list) {
254 # Blank sequence can cause problems through division by zero errors.
255 # Deleting trailing blank sequence helps, but just put in a default value
256 # for totalsum in case of problems.
257 $totalsum = 1000 unless $totalsum;
258 # Calculate normalized data
259 $trace_data->{$_}[$datapoint] = int(($orig_values{$_}[$datapoint] / $totalsum) * 4000 * 100);
260 }
261 # Finally subtract values for datapoint -500
262 for (@trace_list) {
263 $totalsum -= $orig_values{$_}[$datapoint - 500];
264 }
265 }
266 # Now do first 500 - special case, since can't do 500 before. Instead just
267 # take all points before. Not so critical anyway, since data quality is poor
268 # at start so any mismatches will be unreliable anyway.
269 # Start by initialising totalsum
270 $totalsum = 0;
271 for my $index (0..500) {
272 for (@trace_list) {
273 $totalsum += $orig_values{$_}[$index];
274 }
275 }
276 # Blank sequence can cause problems through division by zero errors.
277 # Deleting trailing blank sequence helps, but just put in a default value
278 # for totalsum in case of problems.
279 $totalsum = 1000 unless $totalsum;
280
281 for my $datapoint (0..499) {
282 # Can do 500 after though
283 my $end = $datapoint + 500;
284 # Normalize to 100. Divide by sum of all values, multiply by number of
285 # values, and multiply by 100;
286 for (@trace_list) {
287 # Blank sequence can cause problems through division by zero errors.
288 # Deleting trailing blank sequence helps, but just put in a default value
289 # for totalsum in case of problems.
290 $totalsum = 1000 unless $totalsum;
291 # Calculate normalized data
292 $trace_data->{$_}[$datapoint] = int(($orig_values{$_}[$datapoint] / $totalsum) * $end * 4 * 100);
293 }
294 # Add next value to totalsum, to keep 500 values after
295 for (@trace_list) {
296 $totalsum += $orig_values{$_}[$end + 1];
297 }
298 }
299
300 # Finally the last 500 - again a special case, as can't do 500 after. Instead
301 # just take all points after. Not so critical anyway, since data quality is
302 # poor at end so any mismatches will be unreliable anyway.
303 # Again start by initialising totalsum
304 $totalsum = 0;
305 for my $index (($trace_length-1000)..($trace_length-1)) {
306 for (@trace_list) {
307 $totalsum += abs($orig_values{$_}[$index]);
308 }
309 }
310 # Identify start and finish points
311 my ($first, $last) = (($trace_length-500),($trace_length-1));
312 for my $datapoint ($first..$last) {
313 my $start = $datapoint - 500;
314 # Normalize to 100. Divide by sum of all values, multiply by number of
315 # values, and multiply by 100;
316 for (@trace_list) {
317 # Blank sequence can cause problems through division by zero errors.
318 # Deleting trailing blank sequence helps, but just put in a default value
319 # for totalsum in case of problems.
320 $totalsum = 1000 unless $totalsum;
321 # Calculate normalized data
322 $trace_data->{$_}[$datapoint] = int(($orig_values{$_}[$datapoint] / $totalsum) * ($last-$start) * 4 * 100);
323 }
324 # Subtract first value from totalsum, to keep to 500 point before test
325 for (@trace_list) {
326 $totalsum -= abs($orig_values{$_}[$start]);
327 }
328 }
329
330 return 1;
331 }
332
333
334 sub get_best_align {
335 # This does an alignment of the first 500 datapoints using a range of offsets
336 # from -200 to 200. The best alignment is picked on the basis of having the
337 # lowest score from datapoint 200 to 500, and is used to allow for any
338 # variation in start position of the two sequences.
339 my ($ref, $test) = @_;
340 my %scores;
341 for (my $offset=-200;$offset<=200;$offset+=20) {
342 # Create temporary hashes, since a hash reference is passed to the function
343 # and otherwise the real hash will be modified
344 my (%temp_ref, %temp_test);
345 # Also need to extract hash values, since they too are references and
346 # can't be copied for the same reason.
347 for (@trace_list) {
348 my @temp_ref_trace = @{$ref->{$_}};
349 $temp_ref{$_} = \@temp_ref_trace;
350 my @temp_test_trace = @{$test->{$_}};
351 $temp_test{$_} = \@temp_test_trace;
352 }
353 # Do a partial alignment (first 1000 datapoints)
354 align (\%temp_ref, \%temp_test, $offset, 1000);
355 # Work out the score for that alignment
356 $scores{$offset} = get_score(200, 1000, 0, \%temp_ref, \%temp_test);
357 }
358 # Sort the scores to find out the lowest, and record the value of that offset
359 my @offset = sort { $scores{$a} <=> $scores{$b} } keys %scores;
360 # Once the best alignment has been determined, then do it for real
361 align($ref, $test, $offset[0], ($#{$test->{a_trace}} + $offset[0]));
362 return 1;
363 }
364
365 sub align {
366 # This takes the normalized traces and returns a best alignment of the two.
367 # Rows are added to or deleted from the test trace to keep the alignment.
368 # Best alignment is calculated by minimising the difference between the test
369 # and reference sequence over the next 30 datapoints. It is adjusted every five
370 # bases. Inserted lines are just a duplicate of the previous line.
371 my ($ref, $test, $min_index, $trace_length) = @_;
372
373 # Add/delete the appropriate number of lines to the test sequence to correct
374 # for the offset value
375 if ($min_index < 0) {
376 # Need to add lines to test sequence, since it's behind
377 for ($min_index..0) {
378 for (@trace_list) {
379 unshift @{$test->{$_}}, $test->{$_}[0];
380 }
381 }
382 } elsif ($min_index > 0) {
383 # Need to delete lines from test sequence, since it's ahead
384 for (@trace_list) {
385 splice @{$test->{$_}}, 0, $min_index;
386 }
387 }
388 # Make a note of the offset value for datapoint numbering
389 $test->{initial_offset} = $min_index;
390 $ref->{initial_offset} = 0;
391
392
393 # Now check alignments
394 for my $index (0..($trace_length - 1)) {
395 # Each fifth entry (starting from 1), check alignment
396 next if $index%5;
397 my $offset;
398 my $start_pos = $index;
399 my $end_pos = $index + 30;
400 # Compare the scores in the current alignment with those one data point in
401 # either direction
402 my $score = get_score($start_pos, $end_pos, 0, $ref, $test);
403 my $pre_score = get_score($start_pos, $end_pos, -1, $ref, $test);
404 my $post_score = get_score($start_pos, $end_pos, 1, $ref, $test);
405 last if $score eq 'no_score' or $post_score eq 'no_score' or $pre_score eq 'no_score';
406 # Work out offset
407 # Default is 0; score is the lowest of the three
408 if ($score < $pre_score && $score < $post_score) {$offset = 0}
409 # if pre-score is the lowest, then offset = -1
410 elsif ($pre_score < $score && $pre_score < $post_score) {$offset = -1}
411 # if post-score is the lowest, then offset = 1
412 elsif ($post_score < $score && $post_score < $pre_score) {$offset = 1}
413 # If in doubt, default to no change
414 else {$offset = 0}
415 # Now insert or delete lines as required
416 if ($offset == 1) {
417 # The reference sample is behind, need to delete a row from test
418 for (@trace_list) {
419 splice @{$test->{$_}}, $index, 1;
420 }
421 }
422 elsif ($offset == -1) {
423 # The reference sample is ahead, need to add a row to test
424 for (@trace_list) {
425 splice @{$test->{$_}}, $index, 0, $test->{$_}[$index];
426 }
427 }
428 }
429 # Reset length of test sequence to correct value
430 $test->{seq_length} = $#{$test->{a_trace}};
431 return 1;
432 }
433
434 sub get_score {
435 # Subroutine used in alignment testing - it gets the total difference between
436 # the two submitted sections of array and returns it.
437 my ($start, $end, $offset, $ref, $test) = @_;
438 my $score;
439 for my $index($start..$end) {
440 return 'no_score' unless defined ($ref->{a_trace}[$index]) and defined($test->{a_trace}[$index + $offset]);
441 for (@trace_list) {
442 $score += abs($ref->{$_}[$index] - $test->{$_}[$index + $offset]);
443 }
444 }
445 return $score;
446 }
447
448 sub differences {
449 # Takes the two traces and calculates the difference between the two. Then
450 # squares this difference to highlight the larger (relevent) changes.
451 # Also returns the length of the shortest sequence, so both are aligned on
452 # image generation
453 my ($ref, $test) = @_;
454 # Just compare up to end of shortest sequence
455 my $min_index = $ref->{seq_length} < $test->{seq_length} ? $ref->{seq_length} : $test->{seq_length};
456
457 # Create a hash for the difference values
458 my %diffs;
459 # Loop through the traces and calculate the differences
460 for my $index (0..$min_index) {
461 # Need to do all four traces
462 for (@trace_list) {
463 # Get the difference
464 my $diff = $ref->{$_}[$index] - $test->{$_}[$index];
465 # Is the difference positive or negative (need to record, otherwise will
466 # be lost on squaring)
467 my $sign = $diff < 0 ? -1 : 1;
468 # Stop saturation by cutting off to a max value
469 if (abs($diff) > 500) {$diff = 500}
470 # Highlight differences by multiplying value by total values of OTHER
471 # channels
472 my $othersum = 1;
473 my $samesum = 1;
474 $diffs{$_}[$index] = $diff;
475 }
476 # Have now got difference for all four traces. Can accentuate real diffs
477 for (@trace_list) {
478 my $diff = $diffs{$_}[$index];
479 my $sign = $diff < 0 ? -1 : 1;
480 my $otherchannels = 1;
481 # Sum all values in the other channels which have the opposite sign
482 for my $channel (@trace_list) {
483 next if $channel eq $_;
484 my $value = $diffs{$channel}[$index];
485 # Ignore if the sign is the same
486 next if $value * $sign > 0;
487 $otherchannels += $value;
488 }
489 my $finaldiff = ($sign * $diff * $diff * sqrt(abs($otherchannels))) / 5000;
490 $finaldiff = $sign * 500 if abs($finaldiff) > 500;
491 $diffs{$_}[$index] = $finaldiff;
492 }
493 }
494
495 return $min_index, \%diffs;
496 }
497
498 sub get_image {
499 # Routine to convert the supplied hash reference into a png image of a graph
500 # This is the slowest section of the program by a long way (typically takes
501 # around 75% of the total calculation time). The limiting factor appears to be
502 # GD::Graph - optimization suggestions are welcome.
503 my ($trace, $min, $max, $size, $output_file, $length, $chromatogram) = @_;
504
505 my @labels;
506 for (0..$length) {push @labels, ''}
507 if ($chromatogram) {
508 my @seq = split //, $trace->{sequence};
509 for (0..@{$trace->{base_pos}}) {
510 # Only mark every 10th base from 10.
511 next if $_ % 10;
512 next unless $_;
513 last unless $trace->{base_pos}[$_];
514 last if ($trace->{base_pos}[$_] - $trace->{initial_offset}) >= $length;
515 # Correct for initial offset - puts numbers in same places as for
516 # chromatogram (otherwise they can get shifted)
517 $labels[($trace->{base_pos}[$_-1] - $trace->{initial_offset})] = $_;
518 }
519 }
520
521 my @plot_data = (\@labels, $trace->{a_trace}, $trace->{g_trace}, $trace->{c_trace}, $trace->{t_trace});
522
523 # Define graph format
524 my $mygraph = GD::Graph::lines->new($length, $size);
525 $mygraph->set(
526 x_label => '',
527 x_ticks => 0,
528 y_label => '',
529 y_max_value => $max,
530 y_min_value => $min,
531 # Draw datasets in 'solid', 'dashed' and 'dotted-dashed' lines
532 line_types => [1, 1, 1, 1],
533 # Set the thickness of line
534 line_width => 1,
535 # Set colors for datasets
536 dclrs => ['green', 'black', 'blue', 'red'],
537 ) or warn $mygraph->error;
538
539 my $myimage = $mygraph->plot(\@plot_data) or die $mygraph->error;
540 open OUT, ">".$output_file;
541 print OUT $myimage->png();
542 close OUT;
543
544 return 1;
545 }
546
547 sub demac {
548 # Subroutine to strip out extra Mac headers
549 for my $file ($temp_dir.$$."ref_seq", $temp_dir.$$."test_seq") {
550 open IN, $file;
551 open OUT, ">".$file."_new";
552 while (<IN>) {
553 if ($. == 1) {
554 s/^.*ABIF/ABIF/;
555 unless (/ABIF/) {error_end('abi_format')}
556 }
557 print OUT $_;
558 }
559 close IN;
560 close OUT;
561 rename $file."_new", $file
562 }
563 }