ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/PrimerMatch/compress_seq.cc
Revision: 1.2
Committed: Wed May 4 18:03:44 2005 UTC (11 years, 7 months ago) by nje01
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +107 -56 lines
Log Message:
Small bug fixes, plus codon based edit distance for peptide searching.

Line File contents
1 /**************************************************************************
2 * This code is part of the supporting infrastructure for ATA Mapper.
3 * Copyright (C) 2002,2003,2004 Applera Corporation. All rights reserved.
4 * Author: Nathan Edwards
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received (LICENSE.txt) a copy of the GNU General Public
17 * License along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *************************************************************************/
20
21
22 #include <unistd.h>
23 #include <assert.h>
24 #include <ctype.h>
25 #include <iostream>
26 #include <vector>
27 #include "sortedvector.t"
28 #include "fasta_io.h"
29 #include "char_io.h"
30 #include "util.h"
31
// Import the std namespace wholesale unless the build defines
// NO_STD_NAMESPACE.
#ifndef NO_STD_NAMESPACE
using namespace std;
#endif

// Release identifier; main() prints it after "Release Tag: " when -v is given.
// NOTE(review): looks like a CVS $Name$ keyword that is expanded on
// checkout - confirm.
char release_tag[] = "$Name: $";

#ifdef __alpha
// Hack to get around problems with mmap
const unsigned long __sbrk_override = 1;
#endif

// Selects the FILE*-based (fopen/fwrite/fprintf) output paths that main()
// guards with "#if defined(_NO_LARGEFILE_STREAMS)".
#define _NO_LARGEFILE_STREAMS
44
/**
 * Print an optional error message followed by the command-line synopsis
 * on standard error, then terminate the process with exit status 1.
 *
 * @param message  Text printed (followed by a blank line) ahead of the
 *                 synopsis. Nothing extra is printed when it is NULL or
 *                 the empty string.
 *
 * This function never returns to its caller.
 */
void usage(const char *message=NULL) {
  // Taking a pointer-to-const lets callers pass string literals legally.
  if (message != NULL && message[0] != '\0') {
    std::cerr << message << std::endl;
    std::cerr << std::endl;
  }
  std::cerr << "Usage: compress_seq [options] \n\n";
  std::cerr << "Options: \n";
  std::cerr << " -i <sequence-database> Input sequence database.\n";
  std::cerr << " Required.\n";
  std::cerr << " -e [true|false] Insert end-of-sequence character.\n";
  std::cerr << " Default: true. Optional.\n";
  std::cerr << " -S [true|false] Insert end-of-sequence character before\n";
  std::cerr << " initial sequence. Default: true. Optional.\n";
  std::cerr << " -E <char> Use specified end-of-sequence character.\n";
  std::cerr << " Specified as decimal, octal, or hex. integer.\n";
  std::cerr << " Default: \"\\n\". Optional\n";
  std::cerr << " -u [true|false] Uppercase sequence characters.\n";
  std::cerr << " Default: true. Optional\n";
  std::cerr << " -n [true|false] Normalize sequence information.\n";
  std::cerr << " Default: false. Optional\n";
  std::cerr << " -D [true|false] Optimize normalized sequence for DNA.\n";
  std::cerr << " Default: true. Optional\n";
  std::cerr << " -z [true|false] Compress normalized sequence information.\n";
  std::cerr << " Default: false. Optional\n";
  std::cerr << " -I [true|false] Write sequence and header index in binary format.\n";
  std::cerr << " Default: true. Optional\n";
  // The table-only switch defaults to false in options(); the help used to
  // print a second, contradictory "Default: true" line for it.
  std::cerr << " -T [true|false] Output character table only.\n";
  std::cerr << " Default: false. Optional\n";
  std::cerr << " -F [true|false] Force all output files to be re-built.\n";
  std::cerr << " Default: false. Optional\n";
  std::cerr << " -C [true|false] Cleanup (delete) unnecessary files.\n";
  std::cerr << " Default: true. Optional\n";
  std::cerr << " -B Use buffered I/O rather than mmap.\n";
  std::cerr << " -v Print version information.\n";
  std::cerr << " -h Command line option help.\n";
  std::cerr << "\n";
  exit(1);
}
83
/**
 * Run-time settings for compress_seq. Every member is assigned a default
 * by options() before the command line is parsed; the letters below are
 * the command-line switches that set each member.
 */
struct Options {
  // bool quiet;            (disabled member kept from the original source)

  std::string database;     // -i: path of the input sequence database

  // End-of-sequence handling.
  bool eos;                 // -e: insert an end-of-sequence character
  bool init_eos;            // -S: also insert one before the first sequence

  bool compress;            // -z: write the compressed sequence output
  bool uc;                  // -u: uppercase sequence characters
  bool normalize;           // -n: write the normalized sequence output
  char eos_char;            // -E: the end-of-sequence character itself
  bool dnaopt;              // -D: order the character table for DNA

  // Housekeeping switches.
  bool force;               // -F: rebuild all output files
  bool cleanup;             // -C: delete files that are no longer needed
  bool bufferedio;          // -B: buffered I/O instead of mmap
  bool binindex;            // -I: write the index in binary format
  bool verbose;             // -v: print release information
  bool tableonly;           // -T: produce the character table only
};
101
102 void options(int argc, char *argv[], Options & opt) {
103 signed char c;
104 optarg = NULL;
105 opt.eos = true;
106 opt.compress = false;
107 opt.uc = true;
108 opt.normalize = false;
109 opt.eos_char = '\n';
110 opt.dnaopt = true;
111 opt.force = false;
112 opt.cleanup = true;
113 opt.bufferedio = false;
114 opt.binindex = true;
115 opt.init_eos = true;
116 opt.verbose = false;
117 opt.tableonly = false;
118 while ((c = getopt(argc, argv, "i:e:S:z:u:D:E:n:F:C:I:T:Bhv")) != -1)
119 switch (c) {
120 case 'i':
121 opt.database = optarg;
122 break;
123 case 'e':
124 if (is_true(optarg)) {
125 opt.eos = true;
126 } else if (is_false(optarg)) {
127 opt.eos = false;
128 } else {
129 usage("Invalid value for -e option.");
130 }
131 break;
132 case 'S':
133 if (is_true(optarg)) {
134 opt.init_eos = true;
135 opt.eos = true;
136 } else if (is_false(optarg)) {
137 opt.init_eos = false;
138 } else {
139 usage("Invalid value for -S option.");
140 }
141 break;
142 case 'E': {
143 int ch;
144 sscanf(optarg,"%i",&ch);
145 opt.eos_char = (char)ch;
146 }
147 break;
148 case 'u':
149 if (is_true(optarg)) {
150 opt.uc = true;
151 } else if (is_false(optarg)) {
152 opt.uc = false;
153 } else {
154 usage("Invalid value for -u option.");
155 }
156 break;
157 case 'D':
158 if (is_true(optarg)) {
159 opt.dnaopt = true;
160 } else if (is_false(optarg)) {
161 opt.dnaopt = false;
162 } else {
163 usage("Invalid value for -D option.");
164 }
165 break;
166 case 'I':
167 if (is_true(optarg)) {
168 opt.binindex = true;
169 } else if (is_false(optarg)) {
170 opt.binindex = false;
171 } else {
172 usage("Invalid value for -I option.");
173 }
174 break;
175 case 'T':
176 if (is_true(optarg)) {
177 opt.tableonly = true;
178 } else if (is_false(optarg)) {
179 opt.tableonly = false;
180 } else {
181 usage("Invalid value for -T option.");
182 }
183 break;
184 case 'n':
185 if (is_true(optarg)) {
186 opt.normalize = true;
187 } else if (is_false(optarg)) {
188 opt.normalize = false;
189 } else {
190 usage("Invalid value for -n option.");
191 }
192 break;
193 case 'z':
194 if (is_true(optarg)) {
195 opt.compress = true;
196 } else if (is_false(optarg)) {
197 opt.compress = false;
198 } else {
199 usage("Invalid value for -z option.");
200 }
201 break;
202 case 'F':
203 if (is_true(optarg)) {
204 opt.force = true;
205 } else if (is_false(optarg)) {
206 opt.force = false;
207 } else {
208 usage("Invalid value for -F option.");
209 }
210 break;
211 case 'C':
212 if (is_true(optarg)) {
213 opt.cleanup = true;
214 } else if (is_false(optarg)) {
215 opt.cleanup = false;
216 } else {
217 usage("Invalid value for -F option.");
218 }
219 break;
220 case 'B':
221 opt.bufferedio = true;
222 break;
223 case 'v':
224 opt.verbose = true;
225 break;
226 case 'h':
227 default :
228 usage();
229 }
230 if (opt.database == ""&&!opt.verbose) usage("Arguement -i missing from commandline.");
231 }
232
233 int main(int argc,char *argv[]) {
234
235 Options opt;
236 options(argc,argv,opt);
237
238 if (opt.verbose) {
239 ostrstream ss;
240 ss << "Release Tag: " << release_tag << ends;
241 std::string v(ss.str());
242 timestamp(v.c_str());
243 }
244
245 if (opt.database == "") {
246 exit(1);
247 }
248 time_t fasta_time = modtime(opt.database);
249 time_t seq_time = modtime(opt.database+".seq");
250 time_t hdr_time = modtime(opt.database+".hdr");
251 time_t idx_time = modtime(opt.database+".idx");
252 time_t idb_time = modtime(opt.database+".idb");
253 time_t tbl_time = modtime(opt.database+".tbl");
254 time_t sqn_time = modtime(opt.database+".sqn");
255 time_t tbz_time = modtime(opt.database+".tbz");
256 time_t sqz_time = modtime(opt.database+".sqz");
257
258 if (idx_time < idb_time) idx_time = idb_time;
259
260 bool didinitscan=false;
261 std::vector<bool> obs(256,false);
262 if (opt.eos) {
263 obs[opt.eos_char] = true;
264 }
265
266 FILE_POSITION_TYPE outputi=0;
267 #define OUTPUTBUFSIZE 8192
268 char outputbuffer[OUTPUTBUFSIZE];
269
270 if (opt.force ||
271 ((!opt.compress && !opt.normalize &&
272 fasta_time >= seq_time) ||
273 fasta_time >= hdr_time ||
274 fasta_time >= idx_time) ||
275 (opt.tableonly && fasta_time >= tbl_time) ||
276 (opt.compress &&
277 (fasta_time >= idx_time || fasta_time >= tbz_time || fasta_time >= sqz_time)) ||
278 (opt.normalize &&
279 (fasta_time >= idx_time || fasta_time >= tbl_time || fasta_time >= sqn_time))
280 ) {
281 didinitscan=true;
282 CharacterProducer* db;
283 if (!opt.bufferedio) {
284 db = new MapFileChars(opt.database.c_str());
285 } else {
286 db = new BufferedFileChars(opt.database.c_str());
287 }
288 #if defined(_NO_LARGEFILE_STREAMS)
289 FILE *seq;
290 #else
291 ofstream seq;
292 #endif
293 if (!opt.tableonly) {
294 #if defined(_NO_LARGEFILE_STREAMS)
295 seq = fopen((opt.database+".seq").c_str(),"wb");
296 #else
297 #if ! defined(__CYGWIN__)
298 seq.open((opt.database+".seq").c_str());
299 #else
300 seq.open((opt.database+".seq").c_str(),ios::binary);
301 #endif
302 #endif
303 }
304
305 #if defined(_NO_LARGEFILE_STREAMS)
306 FILE *hdr;
307 #else
308 ofstream hdr;
309 #endif
310 if (!opt.tableonly) {
311 #if defined(_NO_LARGEFILE_STREAMS)
312 hdr = fopen((opt.database+".hdr").c_str(),"wb");
313 #else
314 #if ! defined(__CYGWIN__)
315 hdr.open((opt.database+".hdr").c_str());
316 #else
317 hdr.open((opt.database+".hdr").c_str(),ios::binary);
318 #endif
319 #endif
320 }
321
322 #if defined(_NO_LARGEFILE_STREAMS)
323 FILE *idx;
324 #else
325 ofstream idx;
326 #endif
327 if (!opt.binindex && !opt.tableonly) {
328 #if defined(_NO_LARGEFILE_STREAMS)
329 idx = fopen((opt.database+".idx").c_str(),"wb");
330 #else
331 #if ! defined(__CYGWIN__)
332 idx.open((opt.database+".idx").c_str());
333 #else
334 idx.open((opt.database+".idx").c_str(),ios::binary);
335 #endif
336 #endif
337 }
338
339 long unsigned int count=0;
340 FILE_POSITION_TYPE headerpos=0;
341 FILE_POSITION_TYPE seqpos=0;
342 sortedvector<FILE_POSITION_TYPE,FILE_POSITION_TYPE> svi;
343 if (opt.init_eos) {
344 outputbuffer[outputi++] = opt.eos_char;
345 seqpos++;
346 }
347 if (!opt.tableonly) {
348 if (!opt.binindex) {
349 #if defined(_NO_LARGEFILE_STREAMS)
350 fprintf(idx,"%lu %llu %llu %llu\n",count,headerpos,seqpos,0);
351 #else
352 idx << count << " " << headerpos << " " << seqpos << " " << 0 << '\n';
353 #endif
354 } else {
355 svi.push_back(seqpos,headerpos);
356 }
357 }
358
359 bool inseq=false;
360 bool inheader=false;
361 bool startofline=true;
362
363 char ch;
364
365 while (!db->eof()) {
366 ch = db->getch();
367 if (startofline && ch == '>') {
368 if (inseq) {
369 if (opt.eos) {
370 outputbuffer[outputi++] = opt.eos_char;
371 seqpos++;
372 }
373 if (outputi >= (OUTPUTBUFSIZE-1)) {
374 if (!opt.tableonly) {
375 #if defined(_NO_LARGEFILE_STREAMS)
376 fwrite(outputbuffer,sizeof(char),outputi,seq);
377 #else
378 seq.write(outputbuffer,outputi);
379 #endif
380 }
381 outputi=0;
382 }
383 if (!opt.tableonly) {
384 if (!opt.binindex) {
385 #if defined(_NO_LARGEFILE_STREAMS)
386 fprintf(idx,"%llu %llu\n",seqpos,(db->pos())-1);
387 #else
388 idx << seqpos << " " << (db->pos())-1 << '\n';
389 #endif
390 } else {
391 svi.push_back(seqpos,headerpos);
392 }
393 }
394 }
395 inheader = true;
396 inseq = false;
397 startofline = false;
398 continue;
399 } else if (inheader) {
400 if (ch == '\n' || ch == '\r') {
401 if (ch == '\r') {
402 ch = db->getch();
403 assert(ch == '\n');
404 }
405 if (!opt.tableonly) {
406 #if defined(_NO_LARGEFILE_STREAMS)
407 fputc(ch,hdr);
408 #else
409 hdr << ch;
410 #endif
411 }
412 headerpos++;
413 inheader=false;
414 inseq=true;
415 startofline=true;
416 count++;
417 if (!opt.binindex && !opt.tableonly) {
418 #if defined(_NO_LARGEFILE_STREAMS)
419 fprintf(idx,"%lu %llu ",count,headerpos);
420 #else
421 idx << count << " " << headerpos << " ";
422 #endif
423 }
424 continue;
425 } else {
426 if (!opt.tableonly) {
427 #if defined(_NO_LARGEFILE_STREAMS)
428 fputc(ch,hdr);
429 #else
430 hdr << ch;
431 #endif
432 }
433 headerpos++;
434 }
435 if (startofline) startofline=false;
436 continue;
437 } else if (inseq) {
438 if (ch == '\n' || ch == '\r') {
439 if (ch == '\r') {
440 ch = db->getch();
441 assert(ch == '\n');
442 }
443 startofline = true;
444 continue;
445 } else if ((int)ch < 33 || (int)ch > 126) {
446 if (startofline) startofline=false;
447 continue;
448 } else {
449 if (opt.uc) ch = toupper(ch);
450 outputbuffer[outputi++] = ch;
451 if (outputi >= (OUTPUTBUFSIZE-1)) {
452 if (!opt.tableonly) {
453 #if defined(_NO_LARGEFILE_STREAMS)
454 fwrite(outputbuffer,sizeof(char),outputi,seq);
455 #else
456 seq.write(outputbuffer,outputi);
457 #endif
458 }
459 outputi=0;
460 }
461 seqpos++;
462 if (opt.normalize || opt.compress || opt.tableonly) obs[ch] = true;
463 if (startofline) startofline=false;
464 continue;
465 }
466 }
467 }
468 if (inheader && !opt.tableonly) {
469 #if defined(_NO_LARGEFILE_STREAMS)
470 fputc('\n',hdr);
471 #else
472 hdr << '\n';
473 #endif
474 headerpos++;
475 count++;
476 if (!opt.binindex) {
477 #if defined(_NO_LARGEFILE_STREAMS)
478 fprintf(idx,"%lu %llu %llu %llu\n",count,headerpos,seqpos,(db->pos())-1);
479 #else
480 idx << count << " " << headerpos << " " << seqpos << " " << (db->pos())-1 << '\n';
481 #endif
482 } else {
483 svi.push_back(seqpos,headerpos);
484 }
485 } else if (inseq) {
486 if (opt.eos) {
487 outputbuffer[outputi++] = opt.eos_char;
488 if (outputi >= (OUTPUTBUFSIZE-1)) {
489 if (!opt.tableonly) {
490 #if defined(_NO_LARGEFILE_STREAMS)
491 fwrite(outputbuffer,sizeof(char),outputi,seq);
492 #else
493 seq.write(outputbuffer,outputi);
494 #endif
495 }
496 outputi=0;
497 }
498 seqpos++;
499 }
500 if (!opt.tableonly) {
501 if (!opt.binindex) {
502 #if defined(_NO_LARGEFILE_STREAMS)
503 fprintf(idx,"%llu %llu\n",seqpos,(db->pos())-1);
504 #else
505 idx << seqpos << " " << (db->pos())-1 << '\n';
506 #endif
507 } else {
508 svi.push_back(seqpos,headerpos);
509 }
510 }
511 }
512 if (outputi > 0) {
513 if (!opt.tableonly) {
514 #if defined(_NO_LARGEFILE_STREAMS)
515 fwrite(outputbuffer,sizeof(char),outputi,seq);
516 #else
517 seq.write(outputbuffer,outputi);
518 #endif
519 }
520 outputi=0;
521 }
522 delete db;
523 if (!opt.tableonly) {
524 #if defined(_NO_LARGEFILE_STREAMS)
525 fclose(seq);
526 #else
527 seq.close();
528 #endif
529 #if defined(_NO_LARGEFILE_STREAMS)
530 fclose(hdr);
531 #else
532 hdr.close();
533 #endif
534 if (!opt.binindex) {
535 #if defined(_NO_LARGEFILE_STREAMS)
536 fclose(idx);
537 #else
538 idx.close();
539 #endif
540 }
541
542 if (opt.binindex) {
543 ofstream idb;
544 #if ! defined(__CYGWIN__)
545 idb.open((opt.database+".idb").c_str());
546 #else
547 idb.open((opt.database+".idb").c_str(),ios::binary);
548 #endif
549 svi.bwrite(idb);
550 idb.close();
551 }
552
553 }
554
555 }
556
557 seq_time = modtime(opt.database+".seq");
558 hdr_time = modtime(opt.database+".hdr");
559 idx_time = modtime(opt.database+".idx");
560 idb_time = modtime(opt.database+".idb");
561 if (idx_time < idb_time) idx_time = idb_time;
562
563 if (!opt.normalize && !opt.compress && !opt.tableonly) return 0;
564
565 if (didinitscan) {
566 std::vector<int> chmap(256);
567 std::vector<int> order(256);
568 unsigned int index=0;
569 ofstream tbl;
570 ofstream tbz;
571 if (opt.normalize || opt.tableonly) {
572 #if ! defined(__CYGWIN__)
573 tbl.open((opt.database+".tbl").c_str());
574 #else
575 tbl.open((opt.database+".tbl").c_str(),ios::binary);
576 #endif
577 }
578 if (opt.compress) {
579 #if ! defined(__CYGWIN__)
580 tbz.open((opt.database+".tbz").c_str());
581 #else
582 tbz.open((opt.database+".tbz").c_str(),ios::binary);
583 #endif
584 }
585 for (int i=0;i<256;i++) {
586 order[i] = i;
587 }
588 if (opt.dnaopt) {
589 order[0] = 'A'; order['A'] = 0;
590 order[1] = 'C'; order['C'] = 1;
591 order[2] = 'G'; order['G'] = 2;
592 order[3] = 'T'; order['T'] = 3;
593 }
594 for (int i=0;i<256;i++) {
595 if (obs[order[i]]) {
596 tbl << (unsigned char)order[i];
597 tbz << (unsigned char)order[i];
598 chmap[order[i]] = index;
599 index++;
600 }
601 }
602 tbl.close();
603 tbz.close();
604 }
605
606 if (!opt.normalize && !opt.compress) return 0;
607
608 tbl_time = modtime(opt.database+".tbl");
609 tbz_time = modtime(opt.database+".tbz");
610
611 if (opt.compress &&
612 (opt.force ||
613 (seq_time >= sqz_time) ||
614 (tbz_time >= sqz_time))
615 ) {
616
617 unsigned char invchmap[256];
618 long unsigned int index;
619 {
620 std::string mapfn(opt.database+".tbz");
621 ifstream chmapfile(mapfn.c_str());
622 char chmap[256];
623 chmapfile.read(chmap,256);
624 index=chmapfile.gcount();
625 chmapfile.close();
626 for (int i=0;i<256;i++) {
627 invchmap[i] = 255;
628 }
629 for (unsigned int i=0;i<index;i++) {
630 invchmap[chmap[i]] = i;
631 }
632 }
633
634 unsigned int bits = 1;
635 while ( (((unsigned int)1) << bits) < index ) {
636 bits++;
637 }
638 unsigned int ucharbits = 8*sizeof(unsigned char);
639 assert(bits <= ucharbits);
640 unsigned int bufsize = least_common_multiple(bits,ucharbits)/8;
641 bufsize *= 8;//sizeof(bigword);
642 // cerr << bits << " " << ucharbits << " " << bufsize << endl;
643
644 CharacterProducer* seqin;
645 if (!opt.bufferedio) {
646 seqin = new MapFileChars((opt.database+".seq").c_str());
647 } else {
648 seqin = new BufferedFileChars((opt.database+".seq").c_str());
649 }
650 #if defined(_NO_LARGEFILE_STREAMS)
651 FILE *sqz = fopen((opt.database+".sqz").c_str(),"wb");
652 #else
653 #if ! defined(__CYGWIN__)
654 ofstream sqz((opt.database+".sqz").c_str());
655 #else
656 ofstream sqz((opt.database+".sqz").c_str(),ios::binary);
657 #endif
658 #endif
659
660 FILE_POSITION_TYPE zposition;
661 unsigned int bufposition;
662 unsigned int bitposition;
663 unsigned int position;
664 position = zposition = bufposition = bitposition = 0;
665
666 unsigned char *buffer = new unsigned char[bufsize];
667 for (unsigned int i=0;i<bufsize;i++) {
668 buffer[i] = 0;
669 }
670
671 outputi=0;
672
673 bool partial_buffer = false;
674 bool eofcache;
675 while (!(eofcache=seqin->eof()) || partial_buffer) {
676 partial_buffer = true;
677 char ch;
678 // cerr << ((eofcache)?"eofcache true":"eofcache false") << endl;
679 // cerr << ((seqin->eof())?"seqin->eof() true":"seqin->eof() false") << endl;
680 if (eofcache) {
681 // checkpoint;
682 ch = opt.eos_char;
683 } else {
684 // checkpoint;
685 ch = seqin->getch();
686 }
687 // cerr << ((seqin->eof())?"seqin->eof() true":"seqin->eof() false") << endl;
688 // cerr << "\"" << (unsigned int)ch << "\" ";
689 // cerr << bufposition << "." << bitposition << " " << endl;
690 unsigned char mask = invchmap[ch];
691 // cerr << (int)mask << " " << (int)ch << endl;
692 assert(mask < 255);
693 // binary_format(cerr,mask);
694 // cerr << endl;
695 mask = mask << (ucharbits - bits);
696 // binary_format(cerr,mask);
697 // cerr << endl;
698 if (bits + bitposition <= ucharbits) {
699 mask = mask >> bitposition;
700 // binary_format(cerr,mask);
701 // cerr << endl;
702 buffer[bufposition] |= mask;
703 // binary_format(cerr,buffer[bufposition]);
704 // cerr << endl;
705 bitposition += bits;
706 } else {
707 unsigned char mask1 = mask;
708 mask = mask >> bitposition;
709 mask1 = mask1 << (bits - (bitposition + bits - ucharbits));
710 // binary_format(cerr,mask);
711 // cerr << " " << (int) mask << endl;
712 // binary_format(cerr,mask1);
713 // cerr << " " << (int) mask1 << endl;
714 // binary_format(cerr,buffer[bufposition]);
715 // cerr << " " << (int) buffer[bufposition] << endl;
716 buffer[bufposition] |= mask;
717 // binary_format(cerr,buffer[bufposition]);
718 // cerr << " " << (int) buffer[bufposition] << endl;
719 buffer[bufposition+1] |= mask1;
720 // binary_format(cerr,buffer[bufposition+1]);
721 // cerr << " " << (int) buffer[bufposition+1] << endl;
722 bitposition += (bits - ucharbits);
723 bufposition ++;
724 }
725 if (bufposition == bufsize-1 && bitposition == ucharbits) {
726 // int z=0;
727 for (unsigned int i=0;i<bufsize;i++) {
728 // binary_format(cerr,buffer[i],bits,z);
729 outputbuffer[outputi++] = buffer[i];
730 buffer[i] = 0;
731 }
732 if (outputi >= (OUTPUTBUFSIZE-bufsize)) {
733 #if defined(_NO_LARGEFILE_STREAMS)
734 fwrite(outputbuffer,sizeof(char),outputi,sqz);
735 #else
736 sqz.write(outputbuffer,outputi);
737 #endif
738 outputi=0;
739 }
740 // cerr << endl;
741 bufposition = 0;
742 bitposition = 0;
743 zposition += bufsize;
744 partial_buffer = false;
745 }
746 position++;
747 }
748 // int z=0;
749 // for (unsigned int i=0;i<bufsize;i++) {
750 // binary_format(cerr,buffer[i],bits,z);
751 // sqz << buffer[i];
752 // }
753 // cerr << endl;
754
755 if (outputi > 0) {
756 #if defined(_NO_LARGEFILE_STREAMS)
757 fwrite(outputbuffer,sizeof(char),outputi,sqz);
758 #else
759 sqz.write(outputbuffer,outputi);
760 #endif
761 outputi=0;
762 }
763 delete buffer;
764
765 delete seqin;
766 #if defined(_NO_LARGEFILE_STREAMS)
767 fclose(sqz);
768 #else
769 sqz.close();
770 #endif
771
772 }
773
774 if (opt.normalize &&
775 (opt.force ||
776 (seq_time >= sqn_time) ||
777 (tbl_time >= sqn_time))
778 ) {
779
780 unsigned char invchmap[256];
781 long unsigned int index;
782 {
783 std::string mapfn(opt.database+".tbl");
784 ifstream chmapfile(mapfn.c_str());
785 char chmap[256];
786 chmapfile.read(chmap,256);
787 index=chmapfile.gcount();
788 chmapfile.close();
789 for (int i=0;i<256;i++) {
790 invchmap[i] = 255;
791 }
792 for (unsigned int i=0;i<index;i++) {
793 invchmap[chmap[i]] = i;
794 }
795 }
796 CharacterProducer* seqin;
797 if (!opt.bufferedio) {
798 seqin = new MapFileChars((opt.database+".seq").c_str());
799 } else {
800 seqin = new BufferedFileChars((opt.database+".seq").c_str());
801 }
802 #if defined(_NO_LARGEFILE_STREAMS)
803 FILE *sqn = fopen((opt.database+".sqn").c_str(),"wb");
804 #else
805 #if ! defined(__CYGWIN__)
806 ofstream sqn((opt.database+".sqn").c_str());
807 #else
808 ofstream sqn((opt.database+".sqn").c_str(),ios::binary);
809 #endif
810 #endif
811
812 char ch;
813
814 outputi=0;
815 while (!seqin->eof()) {
816 ch = seqin->getch();
817 outputbuffer[outputi++] = (unsigned char)invchmap[ch];
818 if (outputi >= (OUTPUTBUFSIZE-1)) {
819 #if defined(_NO_LARGEFILE_STREAMS)
820 fwrite(outputbuffer,sizeof(char),outputi,sqn);
821 #else
822 sqn.write(outputbuffer,outputi);
823 #endif
824 outputi=0;
825 }
826 }
827 if (outputi > 0) {
828 #if defined(_NO_LARGEFILE_STREAMS)
829 fwrite(outputbuffer,sizeof(char),outputi,sqn);
830 #else
831 sqn.write(outputbuffer,outputi);
832 #endif
833 outputi=0;
834 }
835 delete seqin;
836 #if defined(_NO_LARGEFILE_STREAMS)
837 fclose(sqn);
838 #else
839 sqn.close();
840 #endif
841 }
842
843 if (opt.cleanup && (opt.compress || opt.normalize) ) {
844 unlink((opt.database+".seq").c_str());
845 }
846
847 }