ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/pcr_match.cc
Revision: 1.3
Committed: Wed Nov 1 19:48:12 2006 UTC (9 years, 7 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +24 -7 lines
Log Message:
*** empty log message ***

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include <unistd.h>
23 #include <assert.h>
24 #include <math.h>
25 #include <vector>
26 #include <map>
27 #include "keyword_tree.h"
28 #include "keyword_tree.t"
29 #include "shift_and_inexact.h"
30 #include "shift_and.h"
31 #include "sts_io.h"
32 #include "fasta_io.t"
33 #include "char_io.t"
34 #include "primer_alignment.h"
35 #include "util.h"
36 #include "types.h"
37 #include "select.h"
38 #include "select.t"
39 #include "pattern_match.h"
40 #include "pattern_alignment.h"
41 #include "sortedvector.t"
42
43 #if !defined(NO_STD_NAMESPACE)
44 using namespace std;
45 #endif
46
47 char release_tag[] = "$Name: $";
48
49 #if defined(__alpha)
50 // Hack to get around problems with mmap
51 const unsigned long __sbrk_override = 1;
52 #endif
53
54 void newfail_handler() {
55 cerr << "Fatal Error: Attempt to allocate memory with new() failed.\n"
56 << endl;
57 abort();
58 return;
59 }
60
61 struct Options {
62 bool pattern_file;
63 bool sts_pattern_file;
64 bool fasta_pattern_file;
65 std::string patterns;
66 std::string database;
67 int nmismatch;
68 char eos_char;
69 std::string default_alignformat;
70 std::string alignformat;
71 bool alignments;
72 ostream *out;
73 bool delout;
74 bool verbose;
75 bool veryverbose;
76 bool memmap;
77 int dbind;
78 int node;
79 bool wc;
80 bool tn;
81 bool ucdict;
82 bool allorient;
83 bool rev_comp;
84 int maxdist;
85 int mindist;
86 int deviation;
87 int stlen;
88 int edlen;
89 int fplen;
90 int tplen;
91 int seedlen;
92 bool indels;
93 bool betweenlen;
94 int posout;
95 unsigned long int report_interval;
96 Options(int argc, char *argv[]);
97 ~Options();
98 void usage(char *msg=NULL);
99 };
100
101 Options::Options(int argc, char *argv[]) : out(&cout) {
102 signed char c;
103 optarg = NULL;
104 allorient = false;
105 rev_comp = false;
106 pattern_file = false;
107 sts_pattern_file = false;
108 fasta_pattern_file = false;
109 eos_char = '\n';
110 nmismatch = 0;
111 delout = false;
112 default_alignformat = ">%h\\n %>T %>s ... %l ... %<e %<T\\n %>A %!>s %!l %!<e %<A\\n %>Q %>r%!>s %!l %!<e%<r %<Q %a%R\\n";
113 alignformat = default_alignformat;
114 alignments = true;
115 verbose = false;
116 veryverbose = false;
117 memmap = true;
118 dbind = 0;
119 wc = false;
120 tn = false;
121 node =0;
122 report_interval=1000;
123 mindist=0;
124 maxdist=2000;
125 deviation=-1;
126 stlen = 0;
127 edlen = 0;
128 tplen = 0;
129 fplen = 0;
130 seedlen=0;
131 indels = true;
132 betweenlen = false;
133 posout = 2;
134
135 while ((c = getopt(argc, argv, "p:i:o:P:S:F:E:R:k:K:s:e:5:3:x:hrv01VubaA:BD:wWN:M:m:d:")) != -1)
136 switch (c) {
137 case 'p':
138 patterns = optarg;
139 pattern_file = false;
140 break;
141 case 'P':
142 patterns = optarg;
143 pattern_file = true;
144 break;
145 case 'S':
146 patterns = optarg;
147 sts_pattern_file = true;
148 break;
149 case 'F':
150 patterns = optarg;
151 fasta_pattern_file = true;
152 break;
153 case 'i':
154 database = optarg;
155 break;
156 case 'o':
157 if (delout) {
158 delete out;
159 }
160 out = new ofstream(optarg,(ios::out|ios::app|ios::ate));
161 delout = true;
162 break;
163 case 'k':
164 nmismatch = atoi(optarg);
165 indels=true;
166 break;
167 case 'K':
168 nmismatch = atoi(optarg);
169 indels=false;
170 break;
171 case '3':
172 tplen = atoi(optarg);
173 if (optarg[0] == '~') {
174 tplen = -atoi(optarg+1);
175 } else {
176 tplen = atoi(optarg);
177 }
178 break;
179 case '5':
180 fplen = atoi(optarg);
181 if (optarg[0] == '~') {
182 fplen = -atoi(optarg+1);
183 } else {
184 fplen = atoi(optarg);
185 }
186 break;
187 case 's':
188 if (optarg[0] == '~') {
189 stlen = -atoi(optarg+1);
190 } else {
191 stlen = atoi(optarg);
192 }
193 break;
194 case 'e':
195 if (optarg[0] == '~') {
196 edlen = -atoi(optarg+1);
197 } else {
198 edlen = atoi(optarg);
199 }
200 break;
201 case 'x':
202 seedlen = atoi(optarg);
203 break;
204 case 'R':
205 report_interval = atoi(optarg);
206 break;
207 case 'A':
208 alignformat = optarg;
209 alignments = true;
210 break;
211 case '0':
212 posout = 0;
213 break;
214 case '1':
215 posout = 1;
216 break;
217 case 'w':
218 wc = true;
219 tn = false;
220 break;
221 case 'W':
222 wc = true;
223 tn = true;
224 break;
225 case 'u':
226 ucdict = true;
227 break;
228 case 'D':
229 dbind = atoi(optarg);
230 break;
231 case 'N':
232 node = atoi(optarg);
233 break;
234 case 'M':
235 maxdist = atoi(optarg);
236 break;
237 case 'd':
238 deviation = atoi(optarg);
239 break;
240 case 'm':
241 mindist = atoi(optarg);
242 break;
243 case 'E': {
244 int eos_char0;
245 if (!sscanf(optarg,"%i",&eos_char0)) {
246 usage("Invalid end-of-sequence specification.\n");
247 }
248 eos_char = (char)eos_char0;
249 }
250 break;
251 case 'v':
252 verbose = true;
253 break;
254 case 'b':
255 betweenlen = true;
256 break;
257 case 'V':
258 veryverbose = true;
259 verbose = true;
260 break;
261 case 'r':
262 rev_comp = true;
263 break;
264 case 'a':
265 allorient = true;
266 break;
267 case 'B':
268 memmap = false;
269 break;
270 case 'h':
271 default :
272 usage();
273 }
274 if ((patterns == "" || database == "")&&!verbose) usage();
275 if (nmismatch < 0) usage("Number of mismatches (-k) must be at least 0");
276 if (dbind < 0 || dbind > 4) usage("Invalid integer for fasta database indexing (-D).");
277 }
278
279 Options::~Options() {
280 if (delout) {
281 delete out;
282 }
283 }
284
285 std::string spaces(FILE_POSITION_TYPE fp) {
286 std::string ret=" ";
287 while (fp/=10) ret+=' ';
288 return ret;
289 }
290
291 std::string spaces(std::string const & instr) {
292 std::string ret=instr;
293 for (int i=0;i<instr.length();i++) {
294 ret[i] = ' ';
295 }
296 return ret;
297 }
298
299 void Options::usage(char *message) {
300 if (message != NULL && strlen(message) > 0) {
301 cerr << message << endl;
302 cerr << endl;
303 }
304 cerr << "Usage: pcr_match [options] \n\n";
305 cerr << "Options: \n";
306 cerr << " -i <sequence-database> Input sequence database. Required.\n";
307 cerr << " -p <sequences> Primer pairs, separated by whitespace.\n";
308 cerr << " -P <sequence-file> Primer pairs, separated by whitespace.\n";
309 cerr << " \"-\" indicates standard input.\n";
310 cerr << " -S <sequence-file> Primer pairs in UniSTS format.\n";
311 cerr << " \"-\" indicates standard input.\n";
312 cerr << " -F <sequence-file> Primer pairs in FASTA format.\n";
313 cerr << " \"-\" indicates standard input.\n";
314 cerr << " One of -p, -P or -S is required.\n";
315 cerr << " -o <output-file> Output file. Defaults to standard out.\n";
316 cerr << " Will append if output-file exists.\n";
317 cerr << " -k <#-edits> Number of insertions, deletions \n";
318 cerr << " and substitutions permitted. Defaults to 0.\n";
319 cerr << " -K <#-mismatches> Number of substitutions (mismatches) permitted.\n";
320 cerr << " No insertions or deletions allowed. Defaults to 0.\n";
321 cerr << " At most one of -k and -K may be specified.\n";
322 cerr << " -r Reverse reverse complement primer.\n";
323 cerr << " Note: -r is automatically set for \n";
324 cerr << " UniSTS format primer-pair input.\n";
325 cerr << " -a Output all primer-pair orientations.\n";
326 cerr << " -x <#-chars> Number of characters for e(x)act primer seed (word size).\n";
327 cerr << " -s <#-chars> Number of characters from start of pattern\n";
328 cerr << " (or reverse complement) that must match exactly\n";
329 cerr << " (inexactly if preceeded by ~).\n";
330 cerr << " -e <#-chars> Number of characters from end of pattern\n";
331 cerr << " (or reverse complement) that must match exactly\n";
332 cerr << " (inexactly if preceeded by ~).\n";
333 cerr << " -5 <#-chars> Number of characters from 5' end of pattern\n";
334 cerr << " (or reverse complement) that must match exactly\n";
335 cerr << " (inexactly if preceeded by ~).\n";
336 cerr << " -3 <#-chars> Number of characters from 3' end of pattern\n";
337 cerr << " (or reverse complement) that must match exactly\n";
338 cerr << " (inexactly if preceeded by ~).\n";
339 cerr << " -u Upper-case all primers.\n";
340 cerr << " -w Honor IUPAC wildcards in pattern & text,\n";
341 cerr << " no text N wildcard.\n";
342 cerr << " -W Honor IUPAC wildcards in pattern & text\n";
343 cerr << " with text N wildcard.\n";
344 cerr << " -E <int> End-of-sequence character. Default is \'\\n\'\n";
345 cerr << " -m <int> Minimum amplicon length. Default: 0.\n";
346 cerr << " -M <int> Maximum amplicon length. Default: 2000.\n";
347 cerr << " -d <int> Deviation from reported amplicon length. \n";
348 cerr << " STS format primers required. \n";
349 cerr << " Default: No constraint.\n";
350 cerr << " -b Ignore primers in amplicon length computation.\n";
351 cerr << " Default: False.\n";
352 cerr << " -A <format> Alignment output format.\n";
353 cerr << " Default: " << default_alignformat << endl;
354 cerr << " -0 0-based position output. Default: space-based.\n";
355 cerr << " -1 1-based position output. Default: space-based.\n";
356 cerr << " -R <int> Alignment report interval. Default is 2000.\n";
357 cerr << " -N <int> Data structure for primer index.\n";
358 cerr << " Default: Auto.\n";
359 cerr << " -B Don\'t use memmap for I/O, use buffered I/O instead.\n";
360 cerr << " -D (0|1|2|3|4) Fasta database indexing and preprocessing.\n";
361 cerr << " 0: Auto, 1: None, 2: Indexed, 3: Normalized,\n";
362 cerr << " 4: Compressed. Default: Auto.\n";
363 cerr << " -v Verbose (version & diagnostic) output.\n";
364 cerr << " -h Command line option help.\n";
365 cerr << "\n";
366 exit(1);
367 }
368
369 void alignformat(ostream & os,
370 std::string const & alignformat,
371 FILE_POSITION_TYPE const & s,
372 FILE_POSITION_TYPE const & s1,
373 FILE_POSITION_TYPE const & e,
374 FILE_POSITION_TYPE const & e1,
375 FILE_POSITION_TYPE const & five,
376 FILE_POSITION_TYPE const & five1,
377 FILE_POSITION_TYPE const & three,
378 FILE_POSITION_TYPE const & three1,
379 FILE_POSITION_TYPE const & S,
380 FILE_POSITION_TYPE const & S1,
381 FILE_POSITION_TYPE const & E,
382 FILE_POSITION_TYPE const & E1,
383 long unsigned int const & i,
384 unsigned int const & d,
385 unsigned int const & d1,
386 std::string const & p,
387 std::string const & p1,
388 sts_entry const & sts,
389 std::string const & patdeff,
390 std::string const & patdefr,
391 std::string const & q,
392 std::string const & q1,
393 std::string const & Q,
394 std::string const & Q1,
395 std::string const & r,
396 std::string const & r1,
397 std::string const & R,
398 std::string const & R1,
399 bool const & ppo,
400 std::string const & t,
401 std::string const & t1,
402 std::string const & T,
403 std::string const & T1,
404 std::string const & A,
405 std::string const & A1,
406 std::string const & h,
407 std::string const & H,
408 long unsigned int const & f,
409 char * const & a,
410 long unsigned int const & ncount) {
411
412 //
413 // The following codes get transformed into positional markers for a
414 // printf format.
415 //
416 // %s start of alignment from the start of the fasta entry
417 // %e end of alignment from the start of the fasta entry
418 // %S start of alignment in compressed sequence file
419 // %E end of alignment in compressed sequence file
420 // %i index of pattern
421 // %I sts id number
422 // %d edit distance of alignment
423 // %p tandem repeat motif
424 // %Q aligned tandem repeat with insertion characters
425 // %t aligned text
426 // %T aligned text with alignment characters
427 // %A alignment string
428 // %h fasta header of hit sequence
429 // %H first word of fasta header of hit sequence
430 // %% percent character
431 //
432
433 int pos=0;
434 while (pos<alignformat.length()) {
435 if (alignformat[pos] == '%') {
436 pos++;
437 if (pos < alignformat.length()) {
438 bool widthonly=false;
439 if (alignformat[pos] == '!') {
440 widthonly = true;
441 pos++;
442 }
443 int dirn=0;
444 if (alignformat[pos] == '>') dirn = 1;
445 if (alignformat[pos] == '<') dirn = -1;
446 if (dirn!=0) pos++;
447 switch(alignformat[pos]) {
448 case 's':
449 if (dirn > 0) {
450 if (!widthonly) {
451 os << s;
452 } else {
453 os << spaces(s);
454 }
455 } else if (dirn < 0) {
456 if (!widthonly) {
457 os << s1;
458 } else {
459 os << spaces(s1);
460 }
461 }
462 break;
463 case 'e':
464 if (dirn > 0) {
465 if (!widthonly) {
466 os << e;
467 } else {
468 os << spaces(e);
469 }
470 } else if (dirn < 0) {
471 if (!widthonly) {
472 os << e1;
473 } else {
474 os << spaces(e1);
475 }
476 }
477 break;
478 case 'l':
479 if (dirn > 0) {
480 os << E-S;
481 } else if (dirn < 0) {
482 os << E1-S1;
483 } else {
484 if (!widthonly) {
485 os << E1-S;
486 } else {
487 os << spaces(E1-S);
488 }
489 }
490 break;
491 case 'S':
492 if (dirn > 0) {
493 os << S;
494 } else if (dirn < 0) {
495 os << S1;
496 }
497 break;
498 case 'E':
499 if (dirn > 0) {
500 os << E;
501 } else if (dirn < 0) {
502 os << E1;
503 }
504 break;
505 case 'i':
506 os << i;
507 break;
508 case 'd':
509 if (dirn > 0) {
510 os << d;
511 } else if (dirn < 0) {
512 os << d1;
513 }
514 break;
515 case 'p':
516 if (dirn > 0) {
517 os << p;
518 } else if (dirn < 0) {
519 os << p1;
520 }
521 break;
522 case 'P':
523 if (dirn > 0) {
524 os << patdeff;
525 } else if (dirn < 0) {
526 os << patdefr;
527 }
528 break;
529 case 'I':
530 os << sts.id();
531 break;
532 case 'L':
533 if (sts.sizeub() != sts.sizelb()) {
534 if (dirn > 0) {
535 os << sts.sizelb();
536 } else if (dirn < 0) {
537 os << sts.sizeub();
538 } else {
539 os << sts.sizelb() << "-" << sts.sizeub();
540 }
541 } else {
542 os << sts.sizelb();
543 }
544 break;
545 case 'D':
546 {
547 int amplen = E1-S;
548 int deviance = 0;
549 if (amplen > sts.sizeub()) {
550 deviance = amplen - sts.sizeub();
551 } else if (amplen < sts.sizelb()) {
552 deviance = sts.sizelb() - amplen;
553 }
554 os << deviance;
555 }
556 break;
557 case 'a':
558 os << sts.accession();
559 break;
560 case 'O':
561 os << sts.species();
562 break;
563 case '&':
564 os << sts.altacc();
565 break;
566 case 'X':
567 os << sts.chrom();
568 break;
569 case 'q':
570 if (dirn > 0) {
571 os << q;
572 } else if (dirn < 0) {
573 os << q1;
574 }
575 break;
576 case 'Q':
577 if (dirn > 0) {
578 if (!widthonly) {
579 os << Q;
580 } else {
581 os << spaces(Q);
582 }
583 } else if (dirn < 0) {
584 if (!widthonly) {
585 os << Q1;
586 } else {
587 os << spaces(Q1);
588 }
589
590 }
591 break;
592 case 'r':
593 if (dirn > 0) {
594 os << r;
595 } else if (dirn < 0) {
596 os << r1;
597 } else {
598 if (ppo) {
599 os << "F";
600 } else {
601 os << "R";
602 }
603
604 }
605 break;
606 case 'R':
607 if (dirn > 0) {
608 os << R;
609 } else if (dirn < 0) {
610 os << R1;
611 } else {
612 if (ppo) {
613 os << "";
614 } else {
615 os << " REVERSE-STRAND";
616 }
617 }
618 break;
619 case 't':
620 if (dirn > 0) {
621 os << t;
622 } else if (dirn < 0) {
623 os << t1;
624 }
625 break;
626 case 'T':
627 if (dirn > 0) {
628 os << T;
629 } else if (dirn < 0) {
630 os << T1;
631 }
632 break;
633 case 'A':
634 if (dirn > 0) {
635 if (!widthonly) {
636 os << A;
637 } else {
638 os << spaces(A);
639 }
640 } else if (dirn < 0) {
641 if (!widthonly) {
642 os << A1;
643 } else {
644 os << spaces(A1);
645 }
646 }
647 break;
648 case 'h':
649 os << h;
650 break;
651 case 'H':
652 os << H;
653 break;
654 case 'f':
655 os << f;
656 break;
657 case '@':
658 os << a;
659 break;
660 case 'N':
661 os << ncount;
662 break;
663 case '%':
664 os << "%";
665 break;
666 case '0':
667 os << H << " "
668 << s+1 << ".." << e1 << '\t'
669 << sts.id() << '\t';
670 if (sts.accession() != "") {
671 os << '\t' << sts.accession();
672 if (sts.chrom() != "") {
673 os << '\t' << sts.chrom();
674 if (sts.altacc() != "") {
675 os << '\t' << sts.altacc();
676 if (sts.species() != "") {
677 os << '\t' << sts.species();
678 }
679 }
680 }
681 }
682 break;
683 default:
684 os << alignformat[pos];
685 }
686 } else {
687 os << "%";
688 }
689 } else if (alignformat[pos] == '\\') {
690 pos++;
691 if (pos < alignformat.length()) {
692 switch(alignformat[pos]) {
693 case 'n':
694 os << endl;
695 break;
696 case 't':
697 os << '\t';
698 break;
699 case '\\':
700 os << '\\';
701 break;
702 default:
703 os << alignformat[pos];
704 }
705 } else {
706 os << '\\';
707 }
708 } else {
709 os << alignformat[pos];
710 }
711 pos++;
712 }
713 }
714
715 int main(int argc,char *argv[]) {
716
717 set_new_handler(newfail_handler);
718 #if defined(PROFILE)
719 set_profile_signal_handler();
720 #endif
721 // assert(sizeof(FILE_POSITION_TYPE)>=8);
722
723 Options opt(argc,argv);
724
725 if (opt.verbose) {
726 ostrstream ss;
727 ss << "Release Tag: " << release_tag << ends;
728 std::string v(ss.str());
729 timestamp(v.c_str());
730 }
731 std::list<std::string> patterns;
732 std::list<std::string>::iterator patit;
733 std::list<sts_entry> sts;
734 sts_entry null_sts_entry;
735 std::list<sts_entry>::iterator stsit;
736 std::list<std::string> patdeflines;
737 std::list<std::string>::iterator patdefit;
738
739 if (opt.pattern_file) {
740 bool delete_stream = false;
741 istream * ifs(&cin);
742 if (opt.patterns != "-") {
743 ifs = new ifstream(opt.patterns.c_str());
744 delete_stream = true;
745 }
746 std::string pattern;
747 while ((*ifs) >> pattern) {
748 patterns.push_back(pattern);
749 }
750 if (delete_stream) {
751 delete ifs;
752 }
753 } else if (opt.sts_pattern_file) {
754 bool delete_stream = false;
755 istream * ifs(&cin);
756 if (opt.patterns != "-") {
757 ifs = new ifstream(opt.patterns.c_str());
758 delete_stream = true;
759 }
760 sts_entry s;
761 while ((*ifs) >> s) {
762 if (s.forward_primer() == "") break;
763 sts.push_back(s);
764 // cerr << s << endl;
765 patterns.push_back(s.forward_primer());
766 patterns.push_back(s.reverse_primer());
767 }
768 if (delete_stream) {
769 delete ifs;
770 }
771 } else if (opt.fasta_pattern_file) {
772 bool delete_stream = false;
773 istream * ifs(&cin);
774 if (opt.patterns != "-") {
775 ifs = new ifstream(opt.patterns.c_str());
776 delete_stream = true;
777 }
778 fasta_entry f;
779 while ((*ifs) >> f) {
780 if (f.sequence() == "") break;
781 patdeflines.push_back(f.defline());
782 // cerr << s << endl;
783 patterns.push_back(f.sequence());
784 }
785 if (delete_stream) {
786 delete ifs;
787 }
788 } else {
789 istrstream sis(opt.patterns.c_str());
790 std::string pattern;
791 while (sis >> pattern) {
792 patterns.push_back(pattern);
793 }
794 }
795 if (patterns.empty()) {
796 exit(0);
797 } else if (patterns.size() % 2 != 0) {
798 opt.usage("Odd number of primers!");
799 }
800
801 if (opt.ucdict) {
802 for (patit=patterns.begin();patit!=patterns.end();++patit) {
803 uppercase(*patit);
804 }
805 }
806
807 if (opt.rev_comp || opt.sts_pattern_file) {
808 opt.rev_comp = true;
809 int i=0;
810 for (patit=patterns.begin();patit!=patterns.end();++patit) {
811 if (i%2==1) {
812 *patit = reverse_comp(*patit);
813 }
814 i++;
815 }
816 }
817
818 std::string pattern;
819 std::list<std::string>::size_type i=1,j=1,k=1,n=patterns.size();
820 unsigned long int N1=2*n;
821 std::vector<std::string> patarray(N1+1); /* indices 1..2*n, index 0 wasted */
822 std::vector<std::pair<int,int> > patconst(N1+1);
823 std::vector<int> patlen(N1+1);
824 std::vector<sts_entry> stsarray(sts.size()+1);
825 std::vector<std::string> patdefarray(patdeflines.size()+1);
826 int min_exact_const=MAXINT;
827
828 for (patit=patterns.begin(),stsit=sts.begin(),patdefit=patdeflines.begin();
829 patit!=patterns.end();++patit) {
830
831 int fplen=opt.fplen;
832 int tplen=opt.tplen;
833 if (i%2==0) {
834 fplen = opt.tplen;
835 tplen = opt.fplen;
836 }
837 if (opt.verbose && patterns.size() < 100 || opt.veryverbose) {
838 ostrstream ss;
839 if (i%2==1) {
840 ss << "[" << setw(4) << i << "] Forward primer: "
841 << setw(3) << (i-1)/2+1 << " > "
842 << *patit << ends;
843 } else {
844 ss << "[" << setw(4) << i << "] Reverse primer: "
845 << setw(3) << (i-1)/2+1 << " > "
846 << *patit << ends;
847 }
848 std::string v(ss.str());
849 timestamp(v.c_str());
850 }
851 patarray[i]=*patit;
852 patlen[i]=patit->length();
853 if (opt.sts_pattern_file && i%2 == 1) {
854 stsarray[j] = *stsit;
855 j++;
856 stsit++;
857 }
858 if (opt.fasta_pattern_file) {
859 patdefarray[k] = *patdefit;
860 k++;
861 patdefit++;
862 }
863 if (opt.stlen > 0) {
864 patconst[i].first = opt.stlen;
865 } else {
866 patconst[i].first = 0;
867 }
868 if (fplen > patconst[i].first) {
869 patconst[i].first = fplen;
870 }
871 if (opt.edlen < 0 && patit->length() + opt.edlen > patconst[i].first) {
872 patconst[i].first = patit->length()+ opt.edlen;
873 }
874 if (tplen < 0 && patit->length() + tplen > patconst[i].first) {
875 patconst[i].first = patit->length()+ tplen;
876 }
877 if (opt.edlen > 0) {
878 patconst[i].second = opt.edlen;
879 } else {
880 patconst[i].second = 0;
881 }
882 if (tplen > patconst[i].second) {
883 patconst[i].second = tplen;
884 }
885 if (opt.stlen < 0 && patit->length() + opt.stlen > patconst[i].second) {
886 patconst[i].second = patit->length()+ opt.stlen;
887 }
888 if (fplen < 0 && patit->length() + fplen > patconst[i].second) {
889 patconst[i].second = patit->length()+ fplen;
890 }
891 patarray[i+n]=reverse_comp(*patit);
892 patlen[i+n]=patarray[i+n].length();
893 if (opt.verbose && patterns.size() < 100 || opt.veryverbose) {
894 ostrstream ss;
895 if (i%2==1) {
896 ss << "[" << setw(4) << i+n << "] Forward primer: "
897 << setw(3) << (i-1)/2+1 << " < "
898 << patarray[i+n] << ends;
899 } else {
900 ss << "[" << setw(4) << i+n << "] Reverse primer: "
901 << setw(3) << (i-1)/2+1 << " < "
902 << patarray[i+n] << ends;
903 }
904 std::string v(ss.str());
905 timestamp(v.c_str());
906 }
907 if (opt.stlen > 0) {
908 patconst[i+n].first = opt.stlen;
909 } else {
910 patconst[i+n].first = 0;
911 }
912 if (tplen > patconst[i+n].first) {
913 patconst[i+n].first = tplen;
914 }
915 if (opt.edlen < 0 && patit->length() + opt.edlen > patconst[i+n].first) {
916 patconst[i+n].first = patit->length()+ opt.edlen;
917 }
918 if (fplen < 0 && patit->length() + fplen > patconst[i+n].first) {
919 patconst[i+n].first = patit->length()+ fplen;
920 }
921 if (opt.edlen > 0) {
922 patconst[i+n].second = opt.edlen;
923 } else {
924 patconst[i+n].second = 0;
925 }
926 if (fplen > patconst[i+n].second) {
927 patconst[i+n].second = fplen;
928 }
929 if (opt.stlen < 0 && patit->length() + opt.stlen > patconst[i+n].second) {
930 patconst[i+n].second = patit->length()+ opt.stlen;
931 }
932 if (tplen < 0 && patit->length() + tplen > patconst[i+n].second) {
933 patconst[i+n].second = patit->length()+ tplen;
934 }
935 i++;
936 }
937 patterns.clear();
938 sts.clear();
939 patdeflines.clear();
940
941 FastaFile<Lazy_Header_SI>* ff;
942 fasta_file_seq_params ffp;
943 ffp.upper_case = true;
944 ffp.eos_char = opt.eos_char;
945 ff = pick_fasta_file<Lazy_Header_SI>(opt.database,opt.dbind,opt.memmap,
946 /*alignments=*/true,ffp,opt.verbose);
947 PatternMatch* pm;
948 pm = pick_pattern_index(ff,opt.node,opt.nmismatch,&patconst,&patlen,
949 opt.seedlen,opt.wc,opt.tn,opt.indels,false,
950 opt.eos_char,opt.verbose);
951
952 for (int i=1;i<=N1;i++) {
953 pm->add_pattern(patarray[i],i,patconst[i].first,patconst[i].second);
954 }
955 if (opt.verbose) {
956 pm->progress_interval(*ff);
957 }
958 pm->init(*ff);
959
960 if (opt.verbose) {
961 timestamp("Scanning sequence database...");
962 }
963
964 typedef sortedvector<FILE_POSITION_TYPE,pattern_hit_vector::iterator> pathitpq;
965 typedef std::map<unsigned long int,pathitpq> pathitmap;
966 typedef std::pair<unsigned long int,pathitpq> pathitmapelt;
967
968 pattern_hit_vector l(opt.report_interval*2);
969 pattern_hit_vector::iterator it,it2;
970 std::list<pattern_hit_vector::iterator> l1;
971 std::list<pattern_hit_vector::iterator>::iterator it1;
972
973 // checkpoint;
974 bool more;
975 while ((more=pm->find_patterns(*ff,l,opt.report_interval))||!l.empty()) {
976 // checkpoint;
977 // cerr << l.size() << endl;
978 FILE_POSITION_TYPE oldcharspos;
979 oldcharspos = ff->pos();
980
981 pathitmap m;
982 pathitmap::iterator mit,mit1,mit2;
983 pathitpq::iterator pqit,pqit1;
984
985 l.normalize();
986
987 // checkpoint;
988 it = l.begin();
989 while (it != l.end()) {
990 // checkpoint;
991 if ((mit=m.find(it->value()->id()))==m.end()) {
992 // checkpoint;
993 std::pair<pathitmap::iterator,bool> ret;
994 ret = m.insert(pathitmapelt(it->value()->id(),pathitpq()));
995 mit=ret.first;
996 }
997 mit->second.push_back(it->key(),it);
998 ++it;
999 }
1000 // checkpoint;
1001 it = l.begin();
1002 while (it != l.end()) {
1003 // checkpoint;
1004 long unsigned int pid=it->value()->id();
1005 long unsigned int pos=it->key();
1006 // cerr << pid << " " << pos << endl;
1007 long unsigned int pid1=0,pid2=0;
1008 if (pid <= n && pid%2 == 1) {
1009 pid1 = pid+1;
1010 // cerr << "pid1 = " << pid1 << endl;
1011 } else if (pid > n && (pid-n)%2==0) {
1012 pid1 = pid-1;
1013 // cerr << "pid1 = " << pid1 << endl;
1014 }
1015 if (opt.allorient) {
1016 if (pid <= n) {
1017 if (pid%2 == 1) {
1018 pid2 = pid+n+1;
1019 // cerr << "pid2 = " << pid2 << endl;
1020 } else {
1021 pid1 = pid-1;
1022 pid2 = pid+n-1;
1023 // cerr << "pid1 = " << pid1 << endl;
1024 // cerr << "pid2 = " << pid2 << endl;
1025 }
1026 } else {
1027 if (pid%2 == 0) {
1028 pid2 = pid-n-1;
1029 // cerr << "pid2 = " << pid2 << endl;
1030 } else {
1031 pid1 = pid+1;
1032 pid2 = pid-n+1;
1033 // cerr << "pid1 = " << pid1 << endl;
1034 // cerr << "pid2 = " << pid2 << endl;
1035 }
1036 }
1037 }
1038
1039 // checkpoint;
1040
1041 // checkpoint;
1042 // cerr << pid << " " << pid1 << " " << pid2 << endl;
1043
1044 l1.erase(l1.begin(),l1.end());
1045 bool det;
1046 mit=m.find(pid);
1047 assert(mit != m.end());
1048 pqit = mit->second.locate_first_at_least(pos);
1049 assert(pqit->key() == pos);
1050 if (pid1>0) {
1051 mit1=m.find(pid1);
1052 } else {
1053 mit1=m.end();
1054 }
1055 if (pid2>0) {
1056 mit2=m.find(pid2);
1057 } else {
1058 mit2=m.end();
1059 }
1060 // checkpoint;
1061 long unsigned int pair = (pid-((pid>n)?n:0)+1)/2;
1062 FILE_POSITION_TYPE stretch_max=opt.maxdist;
1063 FILE_POSITION_TYPE stretch_min=opt.mindist;
1064 if (opt.betweenlen) {
1065 FILE_POSITION_TYPE plen=0;
1066 if (pid1 != 0) {
1067 plen = patlen[pid1];
1068 }
1069 if (pid2 != 0 && patlen[pid2]>plen) {
1070 plen = patlen[pid2];
1071 }
1072 stretch_max += plen + patlen[pid];
1073 }
1074 if (opt.sts_pattern_file && opt.deviation >= 0) {
1075 if (stretch_max > stsarray[pair].sizeub() + opt.deviation) {
1076 stretch_max = stsarray[pair].sizeub() + opt.deviation;
1077 }
1078 if (stretch_min < stsarray[pair].sizelb() - opt.deviation) {
1079 stretch_min = stsarray[pair].sizelb() - opt.deviation;
1080 }
1081 }
1082 stretch_max += pos - patlen[pid] + (opt.indels?opt.nmismatch:1);
1083 stretch_min += pos - patlen[pid] - (opt.indels?opt.nmismatch:1);
1084 if (oldcharspos < stretch_max && more) {
1085 det=false;
1086 } else {
1087 det=true;
1088 // checkpoint;
1089 if (mit1 != m.end()) {
1090 try {
1091 pqit1 = mit1->second.locate_first_at_least(stretch_min);
1092 }
1093 catch (pathitpq::KeyOutOfRange const & e) {
1094 // checkpoint;
1095 pqit1 = mit1->second.end();
1096 }
1097 if (pqit1!=mit1->second.end()) {
1098 while (pqit1!=mit1->second.end() &&
1099 pqit1->value()->key() <= stretch_max) {
1100 if (pqit1->value()->key()) {
1101 l1.push_back(pqit1->value());
1102 }
1103 ++pqit1;
1104 }
1105 }
1106 }
1107 // checkpoint;
1108 if (mit2 != m.end()) {
1109 try {
1110 pqit1 = mit2->second.locate_first_at_least(stretch_min);
1111 }
1112 catch (pathitpq::KeyOutOfRange const & e) {
1113 pqit1 = mit2->second.end();
1114 }
1115 if (pqit1!=mit2->second.end()) {
1116 while (pqit1!=mit2->second.end() &&
1117 pqit1->value()->key() <= stretch_max) {
1118 if (pqit1->value()->key()) {
1119 l1.push_back(pqit1->value());
1120 }
1121 ++pqit1;
1122 }
1123 }
1124 }
1125 }
1126
1127 // checkpoint;
1128 if (!det&&more) {
1129 ++it;
1130 continue;
1131 }
1132 // checkpoint;
1133 for (it1=l1.begin();it1!=l1.end();++it1) {
1134 // checkpoint;
1135 pattern_alignment *pa =
1136 new editdist_alignment(pos,pos,opt.nmismatch,opt.eos_char,
1137 opt.wc,opt.tn,opt.indels,false,
1138 patconst[pid].first,
1139 patconst[pid].second,false);
1140 long unsigned int pid1 = (*it1)->value()->id();
1141 FILE_POSITION_TYPE pos1 = (*it1)->key();
1142 // cerr << pid1 << " " << pos1 << endl;
1143 pattern_alignment *pa1 =
1144 new editdist_alignment(pos1,pos1,opt.nmismatch,opt.eos_char,
1145 opt.wc,opt.tn,opt.indels,false,
1146 patconst[pid1].first,
1147 patconst[pid1].second,false);
1148 // checkpoint;
1149 // cerr << pid << " " << pos << " " << patarray[pid] << endl;
1150 // checkpoint;
1151 // cerr << pid1 << " " << pos1 << " " << patarray[pid1] << endl;
1152 pa->align(*ff,patarray[pid]);
1153 // checkpoint;
1154 pa1->align(*ff,patarray[pid1]);
1155 // checkpoint;
1156 if (pa->editdist() <= opt.nmismatch &&
1157 pa1->editdist() <= opt.nmismatch ) {
1158 FILE_POSITION_TYPE spe,spe1;
1159 FILE_POSITION_TYPE sps,sps1;
1160 FILE_POSITION_TYPE pe,pe1;
1161 FILE_POSITION_TYPE ps,ps1;
1162 bool rc,rc1;
1163 long unsigned int ind,ind1,pind;
1164 unsigned int ed,ed1;
1165 spe = ff->get_seq_pos(pa->end());
1166 spe1 = ff->get_seq_pos(pa1->end());
1167 sps = spe-pa->length()+1;
1168 sps1 = spe1-pa1->length()+1;
1169 pe = pa->end();
1170 pe1 = pa1->end();
1171 ps = pe-pa->length()+1;
1172 ps1 = pe1-pa1->length()+1;
1173 rc = (pid>n);
1174 rc1 = (pid1>n);
1175 ind = pid-(rc?n:0);
1176 ind1 = pid1-(rc1?n:0);
1177 ed = pa->editdist();
1178 ed1 = pa1->editdist();
1179 sts_entry & stsref = null_sts_entry;
1180 if (ind<ind1) {
1181 pind = ind/2+1;
1182 } else {
1183 pind = ind1/2+1;
1184 }
1185 if (opt.sts_pattern_file) {
1186 stsref = stsarray[pind];
1187 }
1188 std::string patdef("");
1189 if (opt.fasta_pattern_file) {
1190 patdef = patdefarray[ind];
1191 }
1192 std::string patdef1("");
1193 if (opt.fasta_pattern_file) {
1194 patdef1 = patdefarray[ind1];
1195 }
1196 if (opt.rev_comp) {
1197 if (ind%2 == 0) {
1198 rc = !rc;
1199 } else if (ind1%2 == 0) {
1200 rc1 = !rc1;
1201 }
1202 }
1203 long signed int amplicon_len;
1204 if (!opt.betweenlen) {
1205 amplicon_len = pe1;
1206 amplicon_len -= ps;
1207 } else {
1208 amplicon_len = ps1;
1209 amplicon_len -= pe;
1210 }
1211 if (ff->is_subseq(ps,pe1) &&
1212 amplicon_len <= opt.maxdist && amplicon_len >= opt.mindist &&
1213 (!opt.sts_pattern_file || opt.deviation < 0 ||
1214 ((amplicon_len + opt.deviation >= stsref.sizelb()) &&
1215 (amplicon_len <= ((int)stsref.sizeub()) + opt.deviation)))) {
1216 const Lazy_Header_SI & h = ff->get_header_data(pa->end());
1217 char *buffer = new char[amplicon_len+1];
1218 long unsigned int ncount=0;
1219 int i=0;
1220 ff->pos(ps);
1221 while (i<amplicon_len) {
1222 buffer[i] = ff->getch();
1223 if (buffer[i] == 'N' || buffer[i] == 'n') {
1224 ncount++;
1225 }
1226 i++;
1227 }
1228 buffer[i] = '\0';
1229 if (opt.posout == 0) {
1230 spe--;
1231 spe1--;
1232 } else if (opt.posout == 1) {
1233 sps++;
1234 sps1++;
1235 }
1236 alignformat(*opt.out,
1237 opt.alignformat,sps,sps1,spe,spe1,
1238 (rc?spe:sps),(rc1?spe1:sps1),
1239 (rc?sps:spe),(rc1?sps1:spe1),
1240 ps,ps1,pe,pe1,pind,ed,ed1,patarray[ind],
1241 patarray[ind1],
1242 stsref,patdef,patdef1,patarray[pid],
1243 patarray[pid1],
1244 pa->alignment_pattern(patarray[pid]),
1245 pa1->alignment_pattern(patarray[pid1]),
1246 (rc?"R":"F"),(rc1?"R":"F"),
1247 (rc?" REVCOMP":""),(rc1?" REVCOMP":""),
1248 (ind<ind1),
1249 pa->matching_text(),pa1->matching_text(),
1250 pa->alignment_text(),pa1->alignment_text(),
1251 pa->alignment_string(),pa1->alignment_string(),
1252 h.header(),h.short_header(),h.index(),buffer,ncount);
1253 delete [] buffer;
1254 // checkpoint;
1255 }
1256 // checkpoint;
1257 }
1258 delete pa;
1259 delete pa1;
1260 }
1261 it->key() = 0;
1262 ++it;
1263 }
1264 m.erase(m.begin(),m.end());
1265 // checkpoint;
1266 // fix up l...
1267 // Move everything with valid key to beginning...
1268 // int i,i1;
1269 it=l.begin();
1270 it2=l.begin();
1271 // i=0;
1272 // i1=0;
1273 long unsigned int newsize=0;
1274 while (it!=l.end()) {
1275 // cerr << i << " " << i1 << endl;
1276 // cerr << l.key(it) << " " << l.value(it)->id() << endl;
1277 if (it->key() != 0) {
1278 if (it != it2) {
1279 it2->key() = it->key();
1280 it2->value() = it->value();
1281 }
1282 ++newsize;
1283 ++it2;
1284 // i1++;
1285 }
1286 ++it;
1287 // i++;
1288 }
1289 // cerr << newsize << endl;
1290 l.resize(newsize);
1291 ff->pos(oldcharspos);
1292 // checkpoint;
1293 }
1294
1295 if (opt.verbose) {
1296 timestamp("Done.");
1297 }
1298 return 0;
1299 }