ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/primer_match.cc
Revision: 1.3
Committed: Wed Nov 1 19:48:12 2006 UTC (9 years, 8 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +20 -5 lines
Log Message:
*** empty log message ***

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include <unistd.h>
23 #include <assert.h>
24 #include <vector>
25 #include "keyword_tree.h"
26 #include "keyword_tree.t"
27 #include "shift_and.h"
28 #include "shift_and_inexact.h"
29 #include "fasta_io.h"
30 #include "fasta_io.t"
31 #include "char_io.t"
32 #include "primer_alignment.h"
33 #include "util.h"
34 #include "types.h"
35 #include "select.h"
36 #include "select.t"
37 #include "sts_io.h"
38 #include "math.h"
39
40 #if !defined(NO_STD_NAMESPACE)
41 using namespace std;
42 #endif
43
44 char release_tag[] = "$Name: $";
45
46 #if defined(__alpha)
47 // Hack to get around problems with mmap
48 const unsigned long __sbrk_override = 1;
49 #endif
50
51 void newfail_handler() {
52 cerr << "Fatal Error: Attempt to allocate memory with new() failed.\n"
53 << endl;
54 exit(1);
55 return;
56 }
57
58 struct Options {
59 bool rev_comp;
60 bool pattern_file;
61 bool fasta_pattern_file;
62 bool sts_pattern_file;
63 std::string patterns;
64 std::string database;
65 bool ucdict;
66 int tplen;
67 int fplen;
68 int stlen;
69 int edlen;
70 int seedlen;
71 char eos_char;
72 int nmismatch;
73 int maxcount;
74 std::string default_alignformat;
75 std::string alignformat;
76 bool alignments;
77 std::string default_countformat;
78 std::string countformat;
79 bool counts;
80 ostream *out;
81 bool delout;
82 unsigned long int report_interval;
83 bool verbose;
84 bool veryverbose;
85 bool memmap;
86 int node;
87 int dbind;
88 bool dbindex;
89 bool wc;
90 bool tn;
91 bool indels;
92 bool aggregate;
93 bool dna_mutations;
94 int posout;
95 Options(int argc, char *argv[]);
96 ~Options();
97 void usage(char *msg=NULL);
98 };
99
100 Options::Options(int argc, char *argv[]) : out(&cout) {
101 signed char c;
102 optarg = NULL;
103 pattern_file = false;
104 fasta_pattern_file = false;
105 sts_pattern_file = false;
106 rev_comp = false;
107 ucdict = false;
108 tplen = 0;
109 fplen = 0;
110 stlen = 0;
111 edlen = 0;
112 seedlen = 0;
113 eos_char = '\n';
114 nmismatch = 0;
115 maxcount = 0;
116 report_interval = 1000;
117 delout = false;
118 default_alignformat = ">%h\\n %T %s %e %d\\n %A\\n %Q %i%R\\n";
119 alignformat = default_alignformat;
120 alignments = true;
121 default_countformat = "%i %r %q %c%+ ( %C )\\n";
122 countformat = default_countformat;
123 counts = false;
124 verbose = false;
125 veryverbose = false;
126 memmap = true;
127 node = 0;
128 dbind = 0;
129 dbindex = true;
130 wc = false;
131 tn = false;
132 indels = true;
133 aggregate = false;
134 dna_mutations = false;
135 posout = 2; //space-based start and end positions
136
137 while ((c = getopt(argc, argv, "p:i:o:P:F:S:M:k:K:s:e:3:5:x:E:hr01ucavA:C:R:BN:D:IwW")) != -1)
138 switch (c) {
139 case 'p':
140 patterns = optarg;
141 pattern_file = false;
142 break;
143 case 'P':
144 patterns = optarg;
145 pattern_file = true;
146 break;
147 case 'F':
148 patterns = optarg;
149 fasta_pattern_file = true;
150 break;
151 case 'S':
152 patterns = optarg;
153 sts_pattern_file = true;
154 rev_comp = true;
155 break;
156 case 'i':
157 database = optarg;
158 break;
159 case 'o':
160 if (delout) {
161 delete out;
162 }
163 out = new ofstream(optarg,(ios::out|ios::app|ios::ate));
164 delout = true;
165 break;
166 case '3':
167 tplen = atoi(optarg);
168 if (optarg[0] == '~') {
169 tplen = -atoi(optarg+1);
170 } else {
171 tplen = atoi(optarg);
172 }
173 break;
174 case '5':
175 fplen = atoi(optarg);
176 if (optarg[0] == '~') {
177 fplen = -atoi(optarg+1);
178 } else {
179 fplen = atoi(optarg);
180 }
181 break;
182 case 's':
183 if (optarg[0] == '~') {
184 stlen = -atoi(optarg+1);
185 } else {
186 stlen = atoi(optarg);
187 }
188 break;
189 case 'e':
190 if (optarg[0] == '~') {
191 edlen = -atoi(optarg+1);
192 } else {
193 edlen = atoi(optarg);
194 }
195 break;
196 case 'k':
197 if (optarg[0] == '.') {
198 nmismatch = atoi(optarg+1);
199 dna_mutations = true;
200 } else {
201 nmismatch = atoi(optarg);
202 }
203 indels = true;
204 break;
205 case 'K':
206 if (optarg[0] == '.') {
207 nmismatch = atoi(optarg+1);
208 dna_mutations = true;
209 } else {
210 nmismatch = atoi(optarg);
211 }
212 indels = false;
213 break;
214 case 'r':
215 rev_comp = true;
216 break;
217 case 'c':
218 counts = true;
219 alignments = false;
220 break;
221 case 'M':
222 maxcount = atoi(optarg);
223 break;
224 case 'x':
225 seedlen = atoi(optarg);
226 break;
227 case 'A':
228 if (strlen(optarg)>0)
229 alignformat = optarg;
230 alignments = true;
231 break;
232 case '0':
233 posout = 0;
234 break;
235 case '1':
236 posout = 1;
237 break;
238 case 'C':
239 if (strlen(optarg)>0)
240 countformat = optarg;
241 counts = true;
242 break;
243 case 'u':
244 ucdict = true;
245 break;
246 case 'a':
247 aggregate = true;
248 break;
249 case 'w':
250 wc = true;
251 tn = false;
252 break;
253 case 'W':
254 wc = true;
255 tn = true;
256 break;
257 case 'R':
258 report_interval = atoi(optarg);
259 break;
260 case 'N':
261 node = atoi(optarg);
262 break;
263 case 'D':
264 dbind = atoi(optarg);
265 break;
266 case 'E': {
267 int ec;
268 if (!sscanf(optarg,"%i",&ec)) {
269 usage("Invalid end-of-sequence specification.\n");
270 }
271 eos_char = (char)ec;
272 }
273 break;
274 case 'v':
275 verbose = true;
276 break;
277 case 'I':
278 dbindex = false;
279 break;
280 case 'V':
281 verbose = true;
282 veryverbose = true;
283 break;
284 case 'B':
285 memmap = false;
286 break;
287 case 'h':
288 default :
289 usage();
290 }
291 if ((patterns == "" || database == "")&&!verbose) usage("No primers and/or no sequence database supplied.");
292 if (nmismatch < 0) usage("Number of mismatches (-k) must be >= 0.");
293 if (maxcount > 0 && !counts) usage("Can''t use maxcount (-M) without counts (-c or -C).");
294 if (aggregate && !counts) usage("Can''t use aggregate (-a) without counts (-c or -C).");
295 if (dbind < 0 || dbind> 4) usage("Invalid integer for fasta database indexing (-D).");
296 if (dna_mutations && nmismatch > 3) usage("Can''t model > 3 DNA mutations");
297 }
298
299
300 Options::~Options() {
301 if (delout) {
302 delete out;
303 }
304 }
305
306 void Options::usage(char *message) {
307 if (message != NULL && strlen(message) > 0) {
308 cerr << message << endl;
309 cerr << endl;
310 }
311 cerr << "Usage: primer_match [options] \n\n";
312 cerr << "Options: \n";
313 cerr << " -i <sequence-database> Input sequence database. Required.\n";
314 cerr << " -p <sequences> Primer sequences, separated by whitespace.\n";
315 cerr << " -P <sequence-file> Primer sequences, separated by whitespace.\n";
316 cerr << " \"-\" indicates standard input.\n";
317 cerr << " -F <sequence-file> Primer sequences in FASTA format.\n";
318 cerr << " \"-\" indicates standard input.\n";
319 cerr << " -S <sequence-file> Primer sequences in UniSTS format.\n";
320 cerr << " \"-\" indicates standard input.\n";
321 cerr << " One of -p, -P, -F, or -S is required.\n";
322 cerr << " -S automatically sets the -r option.\n";
323 cerr << " -o <output-file> Output file. Defaults to standard out.\n";
324 cerr << " Will append if output-file exists.\n";
325 cerr << " -k <#-mismatches> Number of insertions, deletions \n";
326 cerr << " and substitutions permitted. Defaults to 0.\n";
327 cerr << " -K <#-mismatches> Number of substitutions (mismatches) permitted.\n";
328 cerr << " No insertions or deletions allowed. Defaults to 0.\n";
329 cerr << " At most one of -k and -K may be specified.\n";
330 cerr << " -r Match reverse complement of primers too.\n";
331 cerr << " -x <#-chars> Number of characters for e(x)act seed (word size).\n";
332 cerr << " -s <#-chars> Number of characters from start of pattern\n";
333 cerr << " (or reverse complement) that must match exactly\n";
334 cerr << " (inexactly if preceeded by ~).\n";
335 cerr << " -e <#-chars> Number of characters from end of pattern\n";
336 cerr << " (or reverse complement) that must match exactly\n";
337 cerr << " (inexactly if preceeded by ~).\n";
338 cerr << " -5 <#-chars> Number of characters from 5' end of pattern\n";
339 cerr << " (or reverse complement) that must match exactly\n";
340 cerr << " (inexactly if preceeded by ~).\n";
341 cerr << " -3 <#-chars> Number of characters from 3' end of pattern\n";
342 cerr << " (or reverse complement) that must match exactly\n";
343 cerr << " (inexactly if preceeded by ~).\n";
344 cerr << " -u Uppercase pattern sequences.\n";
345 cerr << " -w Honor IUPAC wildcards in pattern & text,\n";
346 cerr << " no text N wildcard.\n";
347 cerr << " -W Honor IUPAC wildcards in pattern & text\n";
348 cerr << " with text N wildcard.\n";
349 cerr << " -E <int> End-of-sequence character. Default is \'\\n\'\n";
350 cerr << " -c Output counts (only).\n";
351 cerr << " -a Output the aggregate of forward & \n";
352 cerr << " reverse complement counts.\n";
353 cerr << " -M <max-count> Maximum number of occurances to count.\n";
354 cerr << " -A <format> Alignment output format.\n";
355 cerr << " Default: \"" << default_alignformat << "\".\n";
356 cerr << " -0 0-based position output. Default: space-based.\n";
357 cerr << " -1 1-based position output. Default: space-based.\n";
358 cerr << " -C <format> Counts output format.\n";
359 cerr << " Default: \"" << default_countformat << "\".\n";
360 cerr << " -R <int> Alignment report interval. Default is 1000.\n";
361 cerr << " -N <int> Data structure for primer index.\n";
362 cerr << " Default: Auto.\n";
363 cerr << " -B Don\'t use memmap for I/O, use buffered I/O instead.\n";
364 cerr << " -D (0|1|2|3|4) Fasta database indexing and preprocessing.\n";
365 cerr << " 0: Auto, 1: None, 2: Indexed, 3: Normalized,\n";
366 cerr << " 4: Compressed. Default: Auto.\n";
367 cerr << " -I Do not load fasta database index.\n";
368 cerr << " -v Verbose (version & diagnostic) output.\n";
369 cerr << " -h Command line option help.\n";
370 cerr << "\n";
371 exit(1);
372 }
373
374 void alignformat(ostream & os,
375 std::string const & alignformat,
376 FILE_POSITION_TYPE const & s,
377 FILE_POSITION_TYPE const & e,
378 FILE_POSITION_TYPE const & five,
379 FILE_POSITION_TYPE const & three,
380 FILE_POSITION_TYPE const & S,
381 FILE_POSITION_TYPE const & E,
382 long unsigned int const & i,
383 unsigned int const & d,
384 std::string const & p,
385 std::string const & P,
386 std::string const & q,
387 std::string const & Q,
388 std::string const & r,
389 std::string const & R,
390 std::string const & t,
391 std::string const & T,
392 std::string const & A,
393 std::string const & h,
394 std::string const & H,
395 long unsigned int const & f,
396 sts_entry const & sts) {
397
398 //
399 // The following codes get transformed into positional markers for a
400 // printf format.
401 //
402 // %s start of alignment from the start of the fasta entry
403 // %e end of alignment from the start of the fasta entry
404 // %5 position of 5' end of the alignment from the start of fasta entry
405 // %3 position of 3' end of the alignment from the start of fasta entry
406 // %S start of alignment in compressed sequence file
407 // %E end of alignment in compressed sequence file
408 // %i index of pattern
409 // %d edit distance of alignment
410 // %p pattern
411 // %q pattern or rev comp of pattern depending on which hit
412 // %Q aligned pattern or rev comp with insertion characters
413 // %r "F" or "R" depending on whether pattern or rev comp was hit
414 // %R "" or "REVCOMP" depending on whether pattern or rev comp was hit
415 // %t aligned text
416 // %T aligned text with alignment characters
417 // %A alignment string
418 // %h fasta header of hit sequence
419 // %H first work of fasta header of hit sequence
420 // %% percent character
421 //
422 // %= The usual default format, wrapped to approx 50 characters.
423
424 int pos=0;
425 bool sc=false;
426 unsigned int ins=0;
427 unsigned int del=0;
428 unsigned int sub=0;
429 unsigned int wcm=0;
430 unsigned int mat=0;
431 while (pos<alignformat.length()) {
432 if (alignformat[pos] == '%') {
433 pos++;
434 if (pos < alignformat.length()) {
435 switch(alignformat[pos]) {
436 case 's':
437 os << s;
438 break;
439 case 'e':
440 os << e;
441 break;
442 case 'l':
443 os << E-S;
444 break;
445 case '5':
446 os << five;
447 break;
448 case '3':
449 os << three;
450 break;
451 case 'S':
452 os << S;
453 break;
454 case 'E':
455 os << E;
456 break;
457 case 'i':
458 os << i;
459 break;
460 case 'd':
461 os << d;
462 break;
463 case 'D':
464 os << p.length()-(E-S);
465 break;
466 case 'M':
467 {
468 double monomw1 = 0.0;
469 double monomw2 = 0.0;
470 for (unsigned int i0=0;i0<p.length();i0++) {
471 monomw1 += monomolwt(p[i0]);
472 }
473 for (unsigned int i0=0;i0<q.length();i0++) {
474 monomw2 += monomolwt(t[i0]);
475 }
476 os << floor((monomw1-monomw2)*100)/100;
477 }
478 break;
479 case 'p':
480 os << p;
481 break;
482 case 'P':
483 os << P;
484 break;
485 case 'q':
486 os << q;
487 break;
488 case 'Q':
489 os << Q;
490 break;
491 case 'r':
492 os << r;
493 break;
494 case 'R':
495 os << R;
496 break;
497 case 't':
498 os << t;
499 break;
500 case 'T':
501 os << T;
502 break;
503 case 'A':
504 os << A;
505 break;
506 case 'h':
507 os << h;
508 break;
509 case 'H':
510 os << H;
511 break;
512 case 'f':
513 os << f;
514 break;
515 case 'I':
516 os << sts.id();
517 break;
518 case 'L':
519 if (sts.sizeub() != sts.sizelb()) {
520 os << sts.sizelb() << "-" << sts.sizeub();
521 } else {
522 os << sts.sizelb();
523 }
524 break;
525 case 'a':
526 os << sts.accession();
527 break;
528 case 'O':
529 os << sts.species();
530 break;
531 case '&':
532 os << sts.altacc();
533 break;
534 case 'X':
535 os << sts.chrom();
536 break;
537 case '%':
538 os << "%";
539 break;
540 case '|':
541 {
542 if (!sc) {
543 for (unsigned int i0=0;i0<A.length();i0++) {
544 switch (A[i0]) {
545 case '|': mat++; break;
546 case '^': del++; break;
547 case 'v': ins++; break;
548 case '*': sub++; break;
549 case '+': wcm++; break;
550 }
551 }
552 sc = true;
553 }
554 os << mat;
555 }
556 break;
557 case '^':
558 {
559 if (!sc) {
560 for (unsigned int i0=0;i0<A.length();i0++) {
561 switch (A[i0]) {
562 case '|': mat++; break;
563 case '^': del++; break;
564 case 'v': ins++; break;
565 case '*': sub++; break;
566 case '+': wcm++; break;
567 }
568 }
569 sc = true;
570 }
571 os << del;
572 }
573 break;
574 case 'v':
575 {
576 if (!sc) {
577 for (unsigned int i0=0;i0<A.length();i0++) {
578 switch (A[i0]) {
579 case '|': mat++; break;
580 case '^': del++; break;
581 case 'v': ins++; break;
582 case '*': sub++; break;
583 case '+': wcm++; break;
584 }
585 }
586 sc = true;
587 }
588 os << ins;
589 }
590 break;
591 case '*':
592 {
593 if (!sc) {
594 for (unsigned int i0=0;i0<A.length();i0++) {
595 switch (A[i0]) {
596 case '|': mat++; break;
597 case '^': del++; break;
598 case 'v': ins++; break;
599 case '*': sub++; break;
600 case '+': wcm++; break;
601 }
602 }
603 sc = true;
604 }
605 os << sub;
606 }
607 break;
608 case '+':
609 {
610 if (!sc) {
611 for (unsigned int i0=0;i0<A.length();i0++) {
612 switch (A[i0]) {
613 case '|': mat++; break;
614 case '^': del++; break;
615 case 'v': ins++; break;
616 case '*': sub++; break;
617 case '+': wcm++; break;
618 }
619 }
620 sc = true;
621 }
622 os << wcm;
623 }
624 break;
625 case '=':
626 {
627 unsigned int len0=T.length();
628 const unsigned int width0=50;
629 unsigned int textchars_start=0;
630 for (unsigned int i0=0;i0<len0;i0+=width0) {
631 unsigned int nchars=width0;
632 if (i0+nchars > len0) {
633 nchars = len0-i0;
634 }
635 unsigned int textchars_end=textchars_start+nchars;
636 unsigned int editcount0=nchars;
637 for (unsigned int j0=0;j0<nchars;j0++) {
638 if (A[i0+j0] == '|' || A[i0+j0] == '+') editcount0--;
639 if (A[i0+j0] == 'v') textchars_end--;
640 }
641 os << " " << T.substr(i0,width0)
642 << " " << textchars_start
643 << " " << textchars_end
644 << " " << editcount0
645 << "\n"
646 << " " << A.substr(i0,width0)
647 << "\n"
648 << " " << Q.substr(i0,width0)
649 << " " << i << R
650 << "\n";
651 if (len0-i0 > width0) {
652 os << endl;
653 }
654 textchars_start = textchars_end;
655 }
656 }
657 break;
658 default:
659 os << alignformat[pos];
660 }
661 } else {
662 os << "%";
663 }
664 } else if (alignformat[pos] == '\\') {
665 pos++;
666 if (pos < alignformat.length()) {
667 switch(alignformat[pos]) {
668 case 'n':
669 os << '\n';
670 break;
671 case 't':
672 os << '\t';
673 break;
674 case '\\':
675 os << '\\';
676 break;
677 default:
678 os << alignformat[pos];
679 }
680 } else {
681 os << '\\';
682 }
683 } else {
684 os << alignformat[pos];
685 }
686 pos++;
687 }
688 }
689
690 void countformat(ostream & os,
691 std::string const & format,
692 long unsigned int const & i,
693 std::string const & p,
694 std::string const & P,
695 std::string const & q,
696 std::string const & r,
697 std::string const & R,
698 long unsigned int const & c,
699 long unsigned int * const & C,
700 unsigned int const & k,
701 bool const & gtmax,
702 sts_entry const & sts) {
703 //
704 // The following codes get transformed into positional markers for a
705 // printf format.
706 //
707 // %i index of pattern
708 // %p pattern
709 // %q pattern or rev comp of pattern depending on which hit
710 // %r "F" or "R" depending on whether pattern or rev comp was hit
711 // %R "" or "REVCOMP" depending on whether pattern or rev comp was hit
712 // %c count of pattern or rev comp
713 // %C whitespace separated vector of counts by editdist
714 // %+ greater than max count marker "+"
715 // %% percent character
716 //
717
718 int pos=0;
719 while (pos<format.length()) {
720 if (format[pos] == '%') {
721 pos++;
722 if (pos < format.length()) {
723 switch(format[pos]) {
724 case 'i':
725 os << i;
726 break;
727 case 'p':
728 os << p;
729 break;
730 case 'P':
731 os << P;
732 break;
733 case 'q':
734 os << q;
735 break;
736 case 'r':
737 os << r;
738 break;
739 case 'R':
740 os << R;
741 break;
742 case 'c':
743 os << c;
744 break;
745 case 'C':
746 for (unsigned int i=0;i<k;i++) {
747 os << C[i] << " ";
748 }
749 os << C[k];
750 break;
751 case '+':
752 if (gtmax) {
753 os << "+";
754 }
755 break;
756 case '%':
757 os << "%";
758 break;
759 case 'I':
760 os << sts.id();
761 break;
762 case 'L':
763 if (sts.sizeub() != sts.sizelb()) {
764 os << sts.sizelb() << "-" << sts.sizeub();
765 } else {
766 os << sts.sizelb();
767 }
768 break;
769 case 'a':
770 os << sts.accession();
771 break;
772 case 'O':
773 os << sts.species();
774 break;
775 case '&':
776 os << sts.altacc();
777 break;
778 case 'X':
779 os << sts.chrom();
780 break;
781 default:
782 os << format[pos];
783 }
784 } else {
785 os << "%";
786 }
787 } else if (format[pos] == '\\') {
788 pos++;
789 if (pos < format.length()) {
790 switch(format[pos]) {
791 case 'n':
792 os << '\n';
793 break;
794 case 't':
795 os << '\t';
796 break;
797 case '\\':
798 os << '\\';
799 break;
800 default:
801 os << format[pos];
802 }
803 } else {
804 os << '\\';
805 }
806 } else {
807 os << format[pos];
808 }
809 pos++;
810 }
811 }
812
813 int main(int argc,char *argv[]) {
814
815 set_new_handler(newfail_handler);
816 #if defined(PROFILE)
817 set_profile_signal_handler();
818 #endif
819 // assert(sizeof(FILE_POSITION_TYPE)>=8);
820 // assert(sizeof(bigword)>=8);
821
822 Options opt(argc,argv);
823
824 if (opt.verbose) {
825 ostrstream ss;
826 ss << "Release Tag: " << release_tag << ends;
827 std::string v(ss.str());
828 timestamp(v.c_str());
829 }
830
831 std::list<std::string> patterns;
832 std::list<std::string> patdeflines;
833 std::list<sts_entry> sts;
834 std::list<std::string>::iterator patit;
835 std::list<std::string>::iterator patdefit;
836 sts_entry null_sts_entry;
837 std::list<sts_entry>::iterator stsit;
838
839 if (opt.pattern_file) {
840 bool delete_stream = false;
841 istream * ifs(&cin);
842 if (opt.patterns != "-") {
843 ifs = new ifstream(opt.patterns.c_str());
844 delete_stream = true;
845 }
846 std::string pattern;
847 while ((*ifs) >> pattern) {
848 patterns.push_back(pattern);
849 }
850 if (delete_stream) {
851 delete ifs;
852 }
853 } else if (opt.fasta_pattern_file) {
854 bool delete_stream = false;
855 istream * ifs(&cin);
856 if (opt.patterns != "-") {
857 ifs = new ifstream(opt.patterns.c_str());
858 delete_stream = true;
859 }
860 fasta_entry f;
861 while ((*ifs) >> f) {
862 if (f.sequence() == "") break;
863 patterns.push_back(f.sequence());
864 patdeflines.push_back(f.defline());
865 }
866 if (delete_stream) {
867 delete ifs;
868 }
869 } else if (opt.sts_pattern_file) {
870 bool delete_stream = false;
871 istream * ifs(&cin);
872 if (opt.patterns != "-") {
873 ifs = new ifstream(opt.patterns.c_str());
874 delete_stream = true;
875 }
876 sts_entry s;
877 while ((*ifs) >> s) {
878 if (s.forward_primer() == "") break;
879 patterns.push_back(s.forward_primer());
880 patterns.push_back(s.reverse_primer());
881 sts.push_back(s);
882 }
883 if (delete_stream) {
884 delete ifs;
885 }
886 } else {
887 istrstream sis(opt.patterns.c_str());
888 std::string pattern;
889 while (sis >> pattern) {
890 patterns.push_back(pattern);
891 }
892 }
893
894 if (patterns.empty()) {
895 exit(0);
896 }
897
898 if (opt.verbose) {
899 timestamp("Read primers");
900 }
901
902 if (opt.ucdict) {
903 for (patit=patterns.begin();patit!=patterns.end();++patit) {
904 uppercase(*patit);
905 }
906 if (opt.verbose) {
907 timestamp("Uppercase primers");
908 }
909 }
910
911 std::string pattern;
912 std::list<std::string>::size_type i=1,n=patterns.size();
913 unsigned long int N1=((!opt.rev_comp)?1:2)*n;
914 std::vector<std::string> patarray(N1+1); /* indices 1..N1, index 0 wasted */
915 std::vector<std::pair<int,int> > patconst(N1+1);
916 std::vector<int> patlen(N1+1);
917 std::vector<std::string> patdefarray;
918 std::vector<sts_entry> stsarray;
919 if (opt.fasta_pattern_file) {
920 patdefarray.resize(n+1);
921 patdefit=patdeflines.begin();
922 }
923 if (opt.sts_pattern_file) {
924 stsarray.resize(n/2+1);
925 stsit=sts.begin();
926 }
927
928 /* This is only used if we are counting hits rather than outputing them */
929 std::vector<long unsigned int> patcount;
930 std::vector<bool> maxpatcount;
931 if (opt.counts) {
932 patcount.resize(N1*(opt.nmismatch+1));
933 if (opt.maxcount>0) {
934 maxpatcount.resize(N1+1);
935 }
936 }
937
938 #define index_patcount(i,k) (((i)-1)*(opt.nmismatch+1)+(k))
939
940 for (patit=patterns.begin();patit!=patterns.end();++patit) {
941 if (opt.verbose && patterns.size() < 100 || opt.veryverbose) {
942 ostrstream ss;
943 ss << "Pattern " << setw(3) << i << " > "
944 << *patit << ends;
945 std::string v(ss.str());
946 timestamp(v.c_str());
947 }
948 patarray[i]=*patit;
949 patlen[i]=patit->length();
950 if (opt.fasta_pattern_file) {
951 patdefarray[i]=*patdefit;
952 patdefit++;
953 }
954 if (opt.sts_pattern_file && i%2 == 1) {
955 stsarray[(i+1)/2]=*stsit;
956 stsit++;
957 }
958 if (opt.stlen > 0) {
959 patconst[i].first = opt.stlen;
960 } else {
961 patconst[i].first = 0;
962 }
963 if (opt.fplen > patconst[i].first) {
964 patconst[i].first = opt.fplen;
965 }
966 if (opt.edlen < 0 && patit->length() + opt.edlen > patconst[i].first) {
967 patconst[i].first = patit->length()+ opt.edlen;
968 }
969 if (opt.tplen < 0 && patit->length() + opt.tplen > patconst[i].first) {
970 patconst[i].first = patit->length()+ opt.tplen;
971 }
972 if (opt.edlen > 0) {
973 patconst[i].second = opt.edlen;
974 } else {
975 patconst[i].second = 0;
976 }
977 if (opt.tplen > patconst[i].second) {
978 patconst[i].second = opt.tplen;
979 }
980 if (opt.stlen < 0 && patit->length() + opt.stlen > patconst[i].second) {
981 patconst[i].second = patit->length()+ opt.stlen;
982 }
983 if (opt.fplen < 0 && patit->length() + opt.fplen > patconst[i].second) {
984 patconst[i].second = patit->length()+ opt.fplen;
985 }
986 if (opt.counts) {
987 for (int k=0;k<=opt.nmismatch;k++) {
988 patcount[index_patcount(i,k)]=0;
989 }
990 if (opt.maxcount>0) {
991 maxpatcount[i] = false;
992 }
993 }
994 if (opt.rev_comp) {
995 patarray[i+n]=reverse_comp(*patit);
996 patlen[i+n]=patarray[i+n].length();
997 if (opt.stlen > 0) {
998 patconst[i+n].first = opt.stlen;
999 } else {
1000 patconst[i+n].first = 0;
1001 }
1002 if (opt.tplen > patconst[i+n].first) {
1003 patconst[i+n].first = opt.tplen;
1004 }
1005 if (opt.edlen < 0 && patit->length() + opt.edlen > patconst[i+n].first) {
1006 patconst[i+n].first = patit->length()+ opt.edlen;
1007 }
1008 if (opt.fplen < 0 && patit->length() + opt.fplen > patconst[i+n].first) {
1009 patconst[i+n].first = patit->length()+ opt.fplen;
1010 }
1011 if (opt.edlen > 0) {
1012 patconst[i+n].second = opt.edlen;
1013 } else {
1014 patconst[i+n].second = 0;
1015 }
1016 if (opt.fplen > patconst[i+n].second) {
1017 patconst[i+n].second = opt.fplen;
1018 }
1019 if (opt.stlen < 0 && patit->length() + opt.stlen > patconst[i+n].second) {
1020 patconst[i+n].second = patit->length()+ opt.stlen;
1021 }
1022 if (opt.tplen < 0 && patit->length() + opt.tplen > patconst[i+n].second) {
1023 patconst[i+n].second = patit->length()+ opt.tplen;
1024 }
1025
1026 if (opt.verbose && patterns.size() < 100 || opt.veryverbose) {
1027 ostrstream ss;
1028 ss << "Pattern " << setw(3) << i << " < "
1029 << patarray[i+n] << ends;
1030 std::string v(ss.str());
1031 timestamp(v.c_str());
1032 }
1033
1034 if (opt.counts) {
1035 for (int k=0;k<=opt.nmismatch;k++) {
1036 patcount[index_patcount(i+n,k)]=0;
1037 }
1038 if (opt.maxcount>0) {
1039 maxpatcount[i+n] = false;
1040 }
1041 }
1042 }
1043 i++;
1044 }
1045
1046 if (opt.verbose) {
1047 timestamp("Put primers in an array");
1048 }
1049
1050 patterns.clear();
1051 patdeflines.clear();
1052 sts.clear();
1053
1054 FastaFile<Lazy_Header_SI>* ff;
1055 fasta_file_seq_params ffp;
1056 ffp.upper_case = opt.ucdict;
1057 ffp.eos_char = opt.eos_char;
1058 ffp.check_params = opt.dbindex;
1059 ff = pick_fasta_file<Lazy_Header_SI>(opt.database,opt.dbind,opt.memmap,
1060 opt.alignments&&opt.dbindex,
1061 ffp,opt.verbose);
1062
1063 PatternMatch* kt;
1064 kt = pick_pattern_index(ff,opt.node,opt.nmismatch,&patconst,&patlen,
1065 opt.seedlen,opt.wc,opt.tn,opt.indels,opt.dna_mutations,
1066 opt.eos_char,opt.verbose);
1067
1068 for (int i=1;i<=N1;i++) {
1069 kt->add_pattern(patarray[i],i,patconst[i].first,patconst[i].second);
1070 }
1071
1072 if (opt.verbose) {
1073 kt->progress_interval(*ff);
1074 }
1075 kt->init(*ff);
1076
1077 pattern_hit_vector l(opt.report_interval*2);
1078 pattern_hit_vector::iterator it;
1079 bool more;
1080 // checkpoint;
1081 while ((more=kt->find_patterns(*ff,l,opt.report_interval))||!l.empty()) {
1082 // checkpoint;
1083 FILE_POSITION_TYPE oldcharspos;
1084 oldcharspos = ff->pos();
1085 it = l.begin();
1086 while (it != l.end()) {
1087 // checkpoint;
1088 long unsigned int pid = it->value()->id();
1089 // checkpoint;
1090 if (pid &&
1091 (opt.maxcount <= 0 ||
1092 !maxpatcount[pid])) {
1093 pattern_alignment *pa(0);
1094 if (opt.nmismatch == 0) {
1095 if (!opt.wc) {
1096 pa = new exact_alignment(it->key());
1097 } else {
1098 pa = new exact_wc_alignment(it->key(),opt.tn);
1099 }
1100 } else {
1101 // checkpoint;
1102 pa = new editdist_alignment(it->key(),it->key(),
1103 opt.nmismatch,opt.eos_char,
1104 opt.wc,opt.tn,opt.indels,
1105 opt.dna_mutations,
1106 patconst[pid].first,
1107 patconst[pid].second,false);
1108 }
1109 // checkpoint;
1110 pa->align(*ff,patarray[pid]);
1111 // checkpoint;
1112 if (pa->editdist() <= (unsigned int) opt.nmismatch) {
1113 // checkpoint;
1114 if (opt.alignments) {
1115 // checkpoint;
1116 FILE_POSITION_TYPE spe = ff->get_seq_pos(pa->end());
1117 FILE_POSITION_TYPE sps = spe-pa->length()+1;
1118 const FILE_POSITION_TYPE pe = pa->end();
1119 const FILE_POSITION_TYPE ps = pe-pa->length()+1;
1120 const bool rc = (pid>n);
1121 const long unsigned int ind = pid-(rc?n:0);
1122 const unsigned int ed = pa->editdist();
1123 const std::string pat = patarray[pid];
1124 const Lazy_Header_SI & h = ff->get_header_data(pa->end());
1125 // checkpoint;
1126 std::string patdef("");
1127 if (opt.fasta_pattern_file) {
1128 patdef = patdefarray[ind];
1129 }
1130 sts_entry & stsref = null_sts_entry;
1131 if (opt.sts_pattern_file) {
1132 stsref = stsarray[(ind+1)/2];
1133 }
1134 // checkpoint;
1135 if (opt.posout == 0) {
1136 spe--;
1137 } else if (opt.posout == 1) {
1138 sps++;
1139 }
1140 alignformat(*opt.out,
1141 opt.alignformat,sps,spe,(rc?spe:sps),
1142 (rc?sps:spe),ps,pe,ind,ed,patarray[ind],
1143 patdef,pat,pa->alignment_pattern(pat),
1144 (rc?"R":"F"),(rc?" REVCOMP":""),
1145 pa->matching_text(),pa->alignment_text(),
1146 pa->alignment_string(),
1147 h.header(),h.short_header(),h.index(),stsref);
1148 // checkpoint;
1149 }
1150 if (opt.counts) {
1151 patcount[index_patcount(pid,pa->editdist())]++;
1152 if (opt.maxcount > 0) {
1153 long unsigned int count=0;
1154 for (int k=0;k<=opt.nmismatch;k++) {
1155 count += patcount[index_patcount(pid,k)];
1156 }
1157 if (count >= (unsigned int) opt.maxcount) {
1158 maxpatcount[pid] = true;
1159 }
1160 }
1161 }
1162 } else {
1163 timestamp("Bogus hit returned to primer_match main()");
1164 if (opt.alignments) {
1165 const Lazy_Header_SI & h = ff->get_header_data(it->key());
1166 cerr << "Problem sequence is near:\n";
1167 cerr << ">" << h.header() << endl;
1168 } else {
1169 cerr << "Approximate absolute sequence position:\n";
1170 cerr << " " << it->key() << endl;
1171 }
1172 cerr << "Problem primer:\n";
1173 cerr << " " << patarray[pid] << endl;
1174 exit(1);
1175 }
1176 delete pa;
1177 }
1178 ++it;
1179 }
1180 l.clear();
1181 ff->pos(oldcharspos);
1182 }
1183 long unsigned int *counts = new long unsigned int [opt.nmismatch+1];
1184 if (opt.counts) {
1185 for (unsigned int i=1;i<=n;i++) {
1186 unsigned long int total=0;
1187 for (int k=0;k<=opt.nmismatch;k++) {
1188 counts[k]=patcount[index_patcount(i,k)];
1189 total += patcount[index_patcount(i,k)];
1190 }
1191 bool gtmax=false;
1192 if (opt.maxcount>0) {
1193 gtmax = maxpatcount[i];
1194 }
1195 std::string patdef("");
1196 if (opt.fasta_pattern_file) {
1197 patdef = patdefarray[i];
1198 }
1199 sts_entry & stsref = null_sts_entry;
1200 if (opt.sts_pattern_file) {
1201 stsref = stsarray[(i+1)/2];
1202 }
1203 if (!opt.aggregate) {
1204 countformat(*opt.out,opt.countformat,
1205 i,patarray[i],patdef,
1206 patarray[i],"F","",
1207 total,counts,opt.nmismatch,gtmax,
1208 stsref);
1209 }
1210 if (opt.rev_comp) {
1211 if (!opt.aggregate) {
1212 total = 0;
1213 for (int k=0;k<=opt.nmismatch;k++) {
1214 counts[k] = 0;
1215 }
1216 gtmax=false;
1217 }
1218 for (int k=0;k<=opt.nmismatch;k++) {
1219 counts[k]+=patcount[index_patcount(i+n,k)];
1220 total += patcount[index_patcount(i+n,k)];
1221 }
1222 if (opt.maxcount>0) {
1223 gtmax = gtmax || maxpatcount[i+n];
1224 }
1225 if (!opt.aggregate) {
1226 countformat(*opt.out,opt.countformat,
1227 i,patarray[i],patdef,
1228 patarray[i+n],"R"," REVCOMP",
1229 total,counts,opt.nmismatch,gtmax,
1230 stsref);
1231 }
1232 }
1233 if (opt.aggregate) {
1234 countformat(*opt.out,opt.countformat,
1235 i,patarray[i],patdef,
1236 "","","",
1237 total,counts,
1238 opt.nmismatch,gtmax,
1239 stsref);
1240 }
1241 }
1242 }
1243 // delete [] counts;
1244 if (opt.verbose) {
1245 timestamp("Done.");
1246 }
1247 return 0;
1248 }