ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6  
7   #define USAGE "Usage:\n\
8 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
9 <   [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
10 <   [-Q] <input.fq>\n\
8 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 >    <input.fq>[,<input_mates.fq>\n\
12   \n\
13 < Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
14 < for low complexity and collapse duplicate reads\n\
13 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > for low complexity and collapse duplicate reads.\n\
15 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
16 > one read in a pair), the two file names should be given delimited by a comma\n\
17 > or a colon character\n\
18   \n\
19   Options:\n\
20   -n  rename all the reads using the <prefix> followed by a read counter;\n\
21      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22      the read duplication count\n\
23 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
23 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
24 >    file(s) named <input>.<outsuffix> which will be created in the \n\
25 >    current (working) directory; (writes to stdout if -o- is given);\n\
26 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 > -f  file with adapter sequences to trim, each line having this format:\n\
28 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
29   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
30      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31   -3  trim the given adapter sequence at the 3' end of each read\n\
32      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 < -q  trim bases with quality value lower than <minq> (starting the 3' end)\n\
33 > -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37   -l  minimum \"clean\" length after trimming that a read must have\n\
38      in order to pass the filter (default: 16)\n\
# Line 34 | Line 46
46   -Q  convert quality values to the other Phred qv type\n\
47   -V  verbose processing\n\
48   "
49 +
50 + //-z  for -o option, the output stream(s) will be first piped into the given\n
51 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52 +
53 +
54   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55  
56 < //For pair ends sequencing:
56 > //For paired reads sequencing:
57   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 < FILE* f_out=NULL; //stdout if not provided
60 < FILE* f_in=NULL; //input fastq (stdin if not provided)
59 > //FILE* f_out=NULL; //stdout if not provided
60 > //FILE* f_out2=NULL; //for paired reads
61 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
62 > //FILE* f_in2=NULL; //for paired reads
63   FILE* freport=NULL;
64 +
65   bool debug=false;
66   bool verbose=false;
67   bool doCollapse=false;
68   bool doDust=false;
69 + bool fastaOutput=false;
70   bool trashReport=false;
71   //bool rawFormat=false;
72   int min_read_len=16;
# Line 53 | Line 74
74   int dust_cutoff=16;
75   bool isfasta=false;
76   bool convert_phred=false;
77 + GStr outsuffix; // -o
78 + //GStr adapter3;
79 + //GStr adapter5;
80   GStr prefix;
81 < GStr adapter3;
82 < GStr adapter5;
83 <
84 < int qvtrim_min=0;
81 > GStr zcmd;
82 > int num_trimmed5=0;
83 > int num_trimmed3=0;
84 > int num_trimmedN=0;
85 > int num_trimmedQ=0;
86 > int min_trimmed5=INT_MAX;
87 > int min_trimmed3=INT_MAX;
88  
89 + int qvtrim_qmin=0;
90 + int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
91   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92   int qv_cvtadd=0; //could be -31 or +31
93  
# Line 68 | Line 97
97   const int a_m_score=2; //match score
98   const int a_mis_score=-3; //mismatch
99   const int a_dropoff_score=7;
100 < const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
100 > int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
101   const int a_min_chain_score=15; //for gapped alignments
102  
103   class CSegChain;
# Line 273 | Line 302
302    if (!s.is_empty()) {
303        if (s=='-') f=stdout;
304        else {
305 <       f=fopen(s,"w");
305 >       f=fopen(s.chars(),"w");
306         if (f==NULL) GError("Error creating file: %s\n", s.chars());
307         }
308       }
# Line 284 | Line 313
313  
314   GHash<FqDupRec> dhash; //hash to keep track of duplicates
315  
316 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
317 +                       GStr& s, GStr& infname, GStr& infname2);
318 + // uses outsuffix to generate output file names and open file handles as needed
319 +
320 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
321 + void trash_report(char trashcode, GStr& rname, FILE* freport);
322 +
323 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
324 +          GStr& rname, GStr& rinfo, GStr& infname);
325 +
326 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
327 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
328 +
329   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
330   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
331   int dust(GStr& seq);
# Line 294 | Line 336
336   void convertPhred(GStr& q);
337  
338   int main(int argc, char * const argv[]) {
339 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
339 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
340    int e;
299  int icounter=0; //counter for input reads
300  int outcounter=0; //counter for output reads
341    if ((e=args.isError())>0) {
342        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
343        exit(224);
344        }
345    debug=(args.getOpt('Y')!=NULL);
346 <  debug=(args.getOpt('V')!=NULL);
346 >  verbose=(args.getOpt('V')!=NULL);
347    convert_phred=(args.getOpt('Q')!=NULL);
348    doCollapse=(args.getOpt('C')!=NULL);
349    doDust=(args.getOpt('D')!=NULL);
# Line 327 | Line 367
367       }
368    s=args.getOpt('q');
369    if (!s.is_empty()) {
370 <     qvtrim_min=s.asInt();
370 >     qvtrim_qmin=s.asInt();
371 >     }
372 >  s=args.getOpt('t');
373 >  if (!s.is_empty()) {
374 >     qvtrim_max=s.asInt();
375       }
376    s=args.getOpt('p');
377    if (!s.is_empty()) {
# Line 353 | Line 397
397      adapter5.upper();
398      a5len=adapter5.length();
399      }
400 +  s=args.getOpt('a');
401 +  if (!s.is_empty()) {
402 +     int a_minmatch=s.asInt();
403 +     a_min_score=a_minmatch<<1;
404 +     }
405 +  
406 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
407 +                         else outsuffix="-";
408    trashReport=  (args.getOpt('r')!=NULL);
409 <  if (args.startNonOpt()==0) {
409 >  int fcount=args.startNonOpt();
410 >  if (fcount==0) {
411      GMessage(USAGE);
412      exit(224);
413      }
414 <
415 <  openfw(f_out, args, 'o');
416 <  if (f_out==NULL) f_out=stdout;
414 >   if (fcount>1 && doCollapse) {
415 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
416 >    }
417 >  //openfw(f_out, args, 'o');
418 >  //if (f_out==NULL) f_out=stdout;
419    if (trashReport)
420      openfw(freport, args, 'r');
421    char* infile=NULL;
422    while ((infile=args.nextNonOpt())!=NULL) {
423 <    GStr infname(infile);
424 <    if (strcmp(infile,"-")==0) {
425 <       f_in=stdin; infname="stdin"; }
426 <     else {
427 <        f_in=fopen(infile,"r");
428 <        if (f_in==NULL)
429 <            GError("Cannot open input file %s!\n",infile);
430 <        }
431 <     GLineReader fq(f_in);
432 <     char* l=NULL;
433 <     while ((l=fq.getLine())!=NULL) {
434 <        GStr rname; //current read name
435 <        GStr rseq;  //current read sequence
436 <        GStr rqv;   //current read quality values
437 <        GStr s;
438 <        /* if (rawFormat) {
439 <          //TODO: implement qseq parsing here
440 <          //if (raw type=N) then continue; //skip invalid/bad records
441 <          
442 <          } //raw qseq format
443 <         else { // FASTQ or FASTA */
444 <          isfasta=(l[0]=='>');
445 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
446 <          s=l;
447 <          rname=&(l[1]);
448 <          icounter++;
449 <          for (int i=0;i<rname.length();i++)
450 <            if (rname[i]<=' ') { rname.cut(i); break; }
451 <          //now get the sequence
452 <          if ((l=fq.getLine())==NULL)
453 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
454 <          rseq=l; //this must be the DNA line
455 <          while ((l=fq.getLine())!=NULL) {
456 <              //seq can span multiple lines
457 <              if (l[0]=='>' || l[0]=='+') {
458 <                   fq.pushBack();
459 <                   break; //
423 >    int incounter=0; //counter for input reads
424 >    int outcounter=0; //counter for output reads
425 >    int trash_s=0; //too short from the get go
426 >    int trash_Q=0;
427 >    int trash_N=0;
428 >    int trash_D=0;
429 >    int trash_A3=0;
430 >    int trash_A5=0;
431 >    s=infile;
432 >    GStr infname;
433 >    GStr infname2;
434 >    FILE* f_in=NULL;
435 >    FILE* f_in2=NULL;
436 >    FILE* f_out=NULL;
437 >    FILE* f_out2=NULL;
438 >    bool paired_reads=false;
439 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
440 >    GLineReader fq(f_in);
441 >    GLineReader* fq2=NULL;
442 >    if (f_in2!=NULL) {
443 >       fq2=new GLineReader(f_in2);
444 >       paired_reads=true;
445 >       }
446 >    GStr rseq, rqv, seqid, seqinfo;
447 >    GStr rseq2, rqv2, seqid2, seqinfo2;
448 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
449 >       incounter++;
450 >       int a5=0, a3=0, b5=0, b3=0;
451 >       char tcode=0, tcode2=0;
452 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
453 >       //if (!doCollapse) {
454 >         if (fq2!=NULL) {
455 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
456 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
457 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
458 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
459 >               }
460 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
461 >            //decide what to do with this pair and print rseq2 if the pair makes it
462 >            if (tcode>1 && tcode2<=1) {
463 >               //"untrash" rseq
464 >               if (a3-a5+1<min_read_len) {
465 >                   a5=1;
466 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
467                     }
468 <              rseq+=l;
469 <              } //check for multi-line seq
470 <          if (!isfasta) { //reading fastq quality values, which can also be multi-line
471 <            if ((l=fq.getLine())==NULL)
472 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
473 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
474 <            if ((l=fq.getLine())==NULL)
475 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
476 <            rqv=l;
477 <            //if (rqv.length()!=rseq.length())
478 <            //  GError("Error: qv len != seq len for %s\n", rname.chars());
479 <            while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
480 <              rqv+=l; //append to qv string
481 <              }
420 <            }// fastq
421 <        // } //<-- FASTA or FASTQ
422 <        rseq.upper();
423 <        int l5=0;
424 <        int l3=rseq.length()-1;
425 <        if (l3-l5+1<min_read_len) {
426 <           if (trashReport) {
427 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
428 <                  }
429 <           continue;
430 <           }
431 <        if (ntrim(rseq, l5, l3)) { // N-trimming
432 <           //GMessage("before: %s\n",rseq.chars());
433 <           //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434 <           if (l3-l5+1<min_read_len) {
435 <             if (trashReport) {
436 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
437 <                    }
438 <             continue; //invalid read
439 <             }
440 <            //-- keep only the l5..l3 range
441 <           rseq=rseq.substr(l5, l3-l5+1);
442 <           if (!rqv.is_empty())
443 <              rqv=rqv.substr(l5, l3-l5+1);
444 <           l5=0;
445 <           l3=rseq.length()-1;
446 <           }
447 <        if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448 <           if (l3-l5+1<min_read_len) {
449 <             if (trashReport) {
450 <                    fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451 <                    }
452 <             continue; //invalid read
453 <             }
454 <            //-- keep only the l5..l3 range
455 <           rseq=rseq.substr(l5, l3-l5+1);
456 <           if (!rqv.is_empty())
457 <              rqv=rqv.substr(l5, l3-l5+1);
458 <           } //qv trimming
459 <        if (a3len>0) {
460 <          if (trim_adapter3(rseq, l5, l3)) {
461 <             if (l3-l5+1<min_read_len) {
462 <                 if (trashReport) {
463 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
464 <                     }
465 <                 continue;
466 <                 }
467 <              //-- keep only the l5..l3 range
468 <              rseq=rseq.substr(l5, l3-l5+1);
469 <              if (!rqv.is_empty())
470 <                 rqv=rqv.substr(l5, l3-l5+1);
471 <              }//some adapter was trimmed
472 <           } //adapter trimming
473 <        if (a5len>0) {
474 <          if (trim_adapter5(rseq, l5, l3)) {
475 <             if (l3-l5+1<min_read_len) {
476 <                 if (trashReport) {
477 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
478 <                     }
479 <                 continue;
468 >               tcode=1;
469 >               }
470 >             else if (tcode<=1 && tcode2>1) {
471 >               //"untrash" rseq2
472 >               if (b3-b5+1<min_read_len) {
473 >                   b5=1;
474 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
475 >                   }
476 >               tcode2=1;
477 >               }
478 >            if (tcode<=1) { //trimmed or left intact -- write it!
479 >               if (tcode>0) {
480 >                 rseq2=rseq2.substr(b5,b3-b5+1);
481 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
482                   }
483 <              //-- keep only the l5..l3 range
484 <              rseq=rseq.substr(l5, l3-l5+1);
485 <              if (!rqv.is_empty())
486 <                 rqv=rqv.substr(l5, l3-l5+1);
487 <              }//some adapter was trimmed
488 <           } //adapter trimming
489 <        if (doCollapse) {
490 <           //keep read for later
491 <           FqDupRec* dr=dhash.Find(rseq.chars());
492 <           if (dr==NULL) { //new entry
493 <                  //if (prefix.is_empty())
494 <                     dhash.Add(rseq.chars(),
495 <                          new FqDupRec(&rqv, rname.chars()));
496 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
483 >               int nocounter=0;
484 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
485 >               }
486 >            } //paired read
487 >       // }
488 >       if (tcode>1) { //trashed
489 >          if (tcode=='s') trash_s++;
490 >            else if (tcode=='Q') trash_Q++;
491 >              else if (tcode=='N') trash_N++;
492 >               else if (tcode=='D') trash_D++;
493 >                else if (tcode=='3') trash_A3++;
494 >                 else if (tcode=='5') trash_A5++;
495 >          if (trashReport) trash_report(tcode, seqid, freport);
496 >          }
497 >         else if (!doCollapse) { //write it
498 >          if (tcode>0) {
499 >            rseq=rseq.substr(a5,a3-a5+1);
500 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
501 >            }
502 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
503 >          }
504 >       } //for each fastq record
505 >    delete fq2;
506 >    FRCLOSE(f_in);
507 >    FRCLOSE(f_in2);
508 >    if (doCollapse) {
509 >       outcounter=0;
510 >       int maxdup_count=1;
511 >       char* maxdup_seq=NULL;
512 >       dhash.startIterate();
513 >       FqDupRec* qd=NULL;
514 >       char* seq=NULL;
515 >       while ((qd=dhash.NextData(seq))!=NULL) {
516 >         GStr rseq(seq);
517 >         //do the dusting here
518 >         if (doDust) {
519 >            int dustbases=dust(rseq);
520 >            if (dustbases>(rseq.length()>>1)) {
521 >               if (trashReport && qd->firstname!=NULL) {
522 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
523                   }
524 <              else    
525 <                 dr->add(rqv);
526 <           } //collapsing duplicates
527 <         else { //not collapsing duplicates
528 <           //do the dust filter now
529 <           if (doDust) {
530 <             int dustbases=dust(rseq);
531 <             if (dustbases>(rseq.length()>>1)) {
532 <                if (trashReport) {
533 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
534 <                  }
535 <                continue;
536 <                }
509 <             }
510 <           //print this record here  
511 <           outcounter++;
512 <           if (isfasta) {
513 <            if (prefix.is_empty())
514 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
515 <              else
516 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
517 <                                  rseq.chars());
524 >               trash_D+=qd->count;
525 >               continue;
526 >               }
527 >            }
528 >         outcounter++;
529 >         if (qd->count>maxdup_count) {
530 >            maxdup_count=qd->count;
531 >            maxdup_seq=seq;
532 >            }
533 >         if (isfasta) {
534 >           if (prefix.is_empty()) {
535 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
536 >                           rseq.chars());
537               }
538 <           else {  //fastq
539 <            if (convert_phred) convertPhred(rqv);
540 <            if (prefix.is_empty())
522 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523 <              else
524 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
525 <                                  rseq.chars(),rqv.chars() );
538 >           else { //use custom read name
539 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
540 >                        qd->count, rseq.chars());
541               }
542 <           } //not collapsing duplicates
543 <        } //for each fastq record
544 <   } //while each input file
545 < FRCLOSE(f_in);
546 < if (doCollapse) {
547 <    outcounter=0;
533 <    int maxdup_count=1;
534 <    char* maxdup_seq=NULL;
535 <    dhash.startIterate();
536 <    FqDupRec* qd=NULL;
537 <    char* seq=NULL;
538 <    while ((qd=dhash.NextData(seq))!=NULL) {
539 <      GStr rseq(seq);
540 <      //do the dusting here
541 <      if (doDust) {
542 <         int dustbases=dust(rseq);
543 <         if (dustbases>(rseq.length()>>1)) {
544 <            if (trashReport && qd->firstname!=NULL) {
545 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
546 <              }
547 <            continue;
542 >           }
543 >         else { //fastq format
544 >          if (convert_phred) convertPhred(qd->qv, qd->len);
545 >          if (prefix.is_empty()) {
546 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
547 >                           rseq.chars(), qd->qv);
548              }
549 +          else { //use custom read name
550 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
551 +                        qd->count, rseq.chars(), qd->qv);
552 +            }
553 +           }
554 +         }//for each element of dhash
555 +       if (maxdup_count>1) {
556 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
557           }
558 <      outcounter++;
559 <      if (qd->count>maxdup_count) {
560 <         maxdup_count=qd->count;
561 <         maxdup_seq=seq;
562 <         }
563 <      if (isfasta) {
564 <        if (prefix.is_empty()) {
565 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
566 <                        rseq.chars());
567 <          }
568 <        else { //use custom read name
569 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
570 <                     qd->count, rseq.chars());
558 >       } //collapse entries
559 >    if (verbose) {
560 >       if (paired_reads) {
561 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
562 >           GMessage("Number of input pairs :%9d\n", incounter);
563 >           GMessage("         Output pairs :%9d\n", outcounter);
564 >           }
565 >         else {
566 >           GMessage(">Input file : %s\n", infname.chars());
567 >           GMessage("Number of input reads :%9d\n", incounter);
568 >           GMessage("         Output reads :%9d\n", outcounter);
569 >           }
570 >       GMessage("------------------------------------\n");
571 >       if (num_trimmed5)
572 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
573 >       if (num_trimmed3)
574 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
575 >       GMessage("------------------------------------\n");
576 >       if (trash_s>0)
577 >         GMessage("     Trashed by length:%9d\n", trash_s);
578 >       if (trash_N>0)
579 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
580 >       if (trash_Q>0)
581 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
582 >       if (trash_A5>0)
583 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
584 >       if (trash_A3>0)
585 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
586 >       }
587 >    if (trashReport) {
588 >          FWCLOSE(freport);
589            }
590 <        }
591 <      else { //fastq format
592 <       if (convert_phred) convertPhred(qd->qv, qd->len);
567 <       if (prefix.is_empty()) {
568 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569 <                        rseq.chars(), qd->qv);
570 <         }
571 <       else { //use custom read name
572 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
573 <                     qd->count, rseq.chars(), qd->qv);
574 <         }
575 <        }
576 <      }//for each element of dhash
577 <    if (maxdup_count>1) {
578 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
579 <      }
580 <   } //report collapsed dhash entries
581 < GMessage("Number of input reads: %9d\n", icounter);
582 < GMessage("       Output records: %9d\n", outcounter);
583 < if (trashReport) {
584 <    FWCLOSE(freport);
585 <    }
590 >    FWCLOSE(f_out);
591 >    FWCLOSE(f_out2);
592 >   } //while each input file
593  
587 FWCLOSE(f_out);
594   //getc(stdin);
595   }
596  
# Line 670 | Line 676
676  
677  
678   bool qtrim(GStr& qvs, int &l5, int &l3) {
679 < if (qvtrim_min==0 || qvs.is_empty()) return false;
679 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
680   if (qv_phredtype==0) {
681    //try to guess the Phred type
682    int vmin=256, vmax=0;
# Line 683 | Line 689
689    if (qv_phredtype==0) {
690      GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
691      }
692 <  } //guessing the Phred type
692 >  if (verbose)
693 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
694 >  } //guessing Phred type
695   for (l3=qvs.length()-1;l3>2;l3--) {
696 <  if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
696 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
697    }
698   //just in case, check also the 5' the end (?)
699   for (l5=0;l5<qvs.length()-3;l5++) {
700 <  if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
700 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
701 >  }
702 > if (qvtrim_max>0) {
703 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
704 >  if (l5>qvtrim_max) l5=qvtrim_max;
705    }
706   return (l5>0 || l3<qvs.length()-1);
707   }
# Line 1313 | Line 1325
1325   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1326   }
1327  
1328 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1329 +          GStr& rname, GStr& rinfo, GStr& infname) {
1330 + rseq="";
1331 + rqv="";
1332 + rname="";
1333 + rinfo="";
1334 + if (fq.eof()) return false;
1335 + char* l=fq.getLine();
1336 + while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1337 + if (l==NULL) return false;
1338 + /* if (rawFormat) {
1339 +      //TODO: implement raw qseq parsing here
1340 +      //if (raw type=N) then continue; //skip invalid/bad records
1341 +      } //raw qseq format
1342 + else { // FASTQ or FASTA */
1343 + isfasta=(l[0]=='>');
1344 + if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1345 +      infname.chars(), l);
1346 + GStr s(l);
1347 + rname=&(l[1]);
1348 + for (int i=0;i<rname.length();i++)
1349 +    if (rname[i]<=' ') {
1350 +       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1351 +       rname.cut(i);
1352 +       break;
1353 +       }
1354 +  //now get the sequence
1355 + if ((l=fq.getLine())==NULL)
1356 +      GError("Error: unexpected EOF after header for read %s (%s)\n",
1357 +                   rname.chars(), infname.chars());
1358 + rseq=l; //this must be the DNA line
1359 + while ((l=fq.getLine())!=NULL) {
1360 +      //seq can span multiple lines
1361 +      if (l[0]=='>' || l[0]=='+') {
1362 +           fq.pushBack();
1363 +           break; //
1364 +           }
1365 +      rseq+=l;
1366 +      } //check for multi-line seq
1367 + if (!isfasta) { //reading fastq quality values, which can also be multi-line
1368 +    if ((l=fq.getLine())==NULL)
1369 +        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1370 +    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1371 +    if ((l=fq.getLine())==NULL)
1372 +        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1373 +    rqv=l;
1374 +    //if (rqv.length()!=rseq.length())
1375 +    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1376 +    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1377 +      rqv+=l; //append to qv string
1378 +      }
1379 +    }// fastq
1380 + // } //<-- FASTA or FASTQ
1381 + rseq.upper(); //TODO: what if we care about masking?
1382 + return true;
1383 + }
1384 +
1385 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1386 + //returns 0 if the read was untouched, 1 if it was just trimmed
1387 + // and a trash code if it was trashed
1388 + l5=0;
1389 + l3=rseq.length()-1;
1390 + if (l3-l5+1<min_read_len) {
1391 +   return 's';
1392 +   }
1393 + GStr wseq(rseq.chars());
1394 + GStr wqv(rqv.chars());
1395 + int w5=l5;
1396 + int w3=l3;
1397 + //first do the q-based trimming
1398 + if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1399 +   if (w3-w5+1<min_read_len) {
1400 +     return 'Q'; //invalid read
1401 +     }
1402 +    //-- keep only the w5..w3 range
1403 +   l5=w5;
1404 +   l3=w3;
1405 +   wseq=wseq.substr(w5, w3-w5+1);
1406 +   if (!wqv.is_empty())
1407 +      wqv=wqv.substr(w5, w3-w5+1);
1408 +   } //qv trimming
1409 + // N-trimming on the remaining read seq
1410 + if (ntrim(wseq, w5, w3)) {
1411 +   //GMessage("before: %s\n",wseq.chars());
1412 +   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1413 +   l5+=w5;
1414 +   l3-=(wseq.length()-1-w3);
1415 +   if (w3-w5+1<min_read_len) {
1416 +     return 'N'; //to be trashed
1417 +     }
1418 +    //-- keep only the w5..w3 range
1419 +   wseq=wseq.substr(w5, w3-w5+1);
1420 +   if (!wqv.is_empty())
1421 +      wqv=wqv.substr(w5, w3-w5+1);
1422 +   w5=0;
1423 +   w3=wseq.length()-1;
1424 +   }
1425 + if (a3len>0) {
1426 +  if (trim_adapter3(wseq, w5, w3)) {
1427 +     int trimlen=wseq.length()-(w3-w5+1);
1428 +     num_trimmed3++;
1429 +     if (trimlen<min_trimmed3)
1430 +         min_trimmed3=trimlen;
1431 +     l5+=w5;
1432 +     l3-=(wseq.length()-1-w3);
1433 +     if (w3-w5+1<min_read_len) {
1434 +         return '3';
1435 +         }
1436 +      //-- keep only the w5..w3 range
1437 +      wseq=wseq.substr(w5, w3-w5+1);
1438 +      if (!wqv.is_empty())
1439 +         wqv=wqv.substr(w5, w3-w5+1);
1440 +      }//some adapter was trimmed
1441 +   } //adapter trimming
1442 + if (a5len>0) {
1443 +  if (trim_adapter5(wseq, w5, w3)) {
1444 +     int trimlen=wseq.length()-(w3-w5+1);
1445 +     num_trimmed5++;
1446 +     if (trimlen<min_trimmed5)
1447 +         min_trimmed5=trimlen;
1448 +     l5+=w5;
1449 +     l3-=(wseq.length()-1-w3);
1450 +     if (w3-w5+1<min_read_len) {
1451 +         return '5';
1452 +         }
1453 +      //-- keep only the w5..w3 range
1454 +      wseq=wseq.substr(w5, w3-w5+1);
1455 +      if (!wqv.is_empty())
1456 +         wqv=wqv.substr(w5, w3-w5+1);
1457 +      }//some adapter was trimmed
1458 +   } //adapter trimming
1459 + if (doCollapse) {
1460 +   //keep read for later
1461 +   FqDupRec* dr=dhash.Find(wseq.chars());
1462 +   if (dr==NULL) { //new entry
1463 +          //if (prefix.is_empty())
1464 +             dhash.Add(wseq.chars(),
1465 +                  new FqDupRec(&wqv, rname.chars()));
1466 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1467 +         }
1468 +      else
1469 +         dr->add(wqv);
1470 +   } //collapsing duplicates
1471 + else { //not collapsing duplicates
1472 +   //apply the dust filter now
1473 +   if (doDust) {
1474 +     int dustbases=dust(wseq);
1475 +     if (dustbases>(wseq.length()>>1)) {
1476 +        return 'D';
1477 +        }
1478 +     }
1479 +   } //not collapsing duplicates
1480 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1481 + }
1482 +
1483 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1484 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1485 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1486 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1487 + }
1488 +
1489 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1490 +   outcounter++;
1491 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1492 +   if (asFasta) {
1493 +    if (prefix.is_empty()) {
1494 +       printHeader(f_out, '>',rname,rinfo);
1495 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1496 +       }
1497 +      else {
1498 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1499 +                          rseq.chars());
1500 +       }
1501 +     }
1502 +   else {  //fastq
1503 +    if (convert_phred) convertPhred(rqv);
1504 +    if (prefix.is_empty()) {
1505 +       printHeader(f_out, '@', rname, rinfo);
1506 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1507 +       }
1508 +      else
1509 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1510 +                          rseq.chars(),rqv.chars() );
1511 +     }
1512 + }
1513 +
1514 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1515 + if (freport==NULL || trashcode<=' ') return;
1516 + if (trashcode=='3' || trashcode=='5') {
1517 +   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1518 +   }
1519 + else {
1520 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1521 +   }
1522 + //tcounter++;
1523 + }
1524 +
1525 + GStr getFext(GStr& s, int* xpos=NULL) {
1526 + //s must be a filename without a path
1527 + GStr r("");
1528 + if (xpos!=NULL) *xpos=0;
1529 + if (s.is_empty() || s=="-") return r;
1530 + int p=s.rindex('.');
1531 + int d=s.rindex('/');
1532 + if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1533 + r=s.substr(p+1);
1534 + if (xpos!=NULL) *xpos=p+1;
1535 + r.lower();
1536 + return r;
1537 + }
1538 +
1539 + void baseFileName(GStr& fname) {
1540 + //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1541 + int xpos=0;
1542 + GStr fext=getFext(fname, &xpos);
1543 + if (xpos<=1) return;
1544 + bool extdel=false;
1545 + GStr f2;
1546 + if (fext=="z" || fext=="zip") {
1547 +   extdel=true;
1548 +   }
1549 +  else if (fext.length()>=2) {
1550 +   f2=fext.substr(0,2);
1551 +   extdel=(f2=="gz" || f2=="bz");
1552 +   }
1553 + if (extdel) {
1554 +   fname.cut(xpos-1);
1555 +   fext=getFext(fname, &xpos);
1556 +   if (xpos<=1) return;
1557 +   }
1558 + extdel=false;
1559 + if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1560 +   extdel=true;
1561 +   }
1562 +  else if (fext.length()>=2) {
1563 +   extdel=(fext.substr(0,2)=="fa");
1564 +   }
1565 + if (extdel) fname.cut(xpos-1);
1566 + GStr fncp(fname);
1567 + fncp.lower();
1568 + fncp.chomp("seq");
1569 + fncp.chomp("sequence");
1570 + fncp.trimR("_.");
1571 + if (fncp.length()<fname.length()) fname.cut(fncp.length());
1572 + }
1573 +
1574 + FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1575 +  FILE* f_out=NULL;
1576 +  GStr fname(getFileName(infname.chars()));
1577 +  //eliminate known extensions
1578 +  baseFileName(fname);
1579 +  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1580 +    else if (pocmd.is_empty()) {
1581 +               GStr oname(fname);
1582 +               oname.append('.');
1583 +               oname.append(outsuffix);
1584 +               f_out=fopen(oname.chars(),"w");
1585 +               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1586 +               }
1587 +            else {
1588 +              GStr oname(">");
1589 +              oname.append(fname);
1590 +              oname.append('.');
1591 +              oname.append(outsuffix);
1592 +              pocmd.append(oname);
1593 +              f_out=popen(pocmd.chars(), "w");
1594 +              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1595 +              }
1596 + return f_out;
1597 + }
1598 +
1599 + void guess_unzip(GStr& fname, GStr& picmd) {
1600 + GStr fext=getFext(fname);
1601 + if (fext=="gz" || fext=="gzip" || fext=="z") {
1602 +    picmd="gzip -cd ";
1603 +    }
1604 +   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1605 +    picmd="bzip2 -cd ";
1606 +    }
1607 + }
1608 +
1609 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1610 +                       GStr& s, GStr& infname, GStr& infname2) {
1611 + // uses outsuffix to generate output file names and open file handles as needed
1612 + infname="";
1613 + infname2="";
1614 + f_in=NULL;
1615 + f_in2=NULL;
1616 + f_out=NULL;
1617 + f_out2=NULL;
1618 + //analyze outsuffix intent
1619 + GStr pocmd;
1620 + if (outsuffix=="-") {
1621 +    f_out=stdout;
1622 +    }
1623 +   else {
1624 +    GStr ox=getFext(outsuffix);
1625 +    if (ox.length()>2) ox=ox.substr(0,2);
1626 +    if (ox=="gz") pocmd="gzip -9 -c ";
1627 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1628 +    }
1629 + if (s=="-") {
1630 +    f_in=stdin;
1631 +    infname="stdin";
1632 +    f_out=prepOutFile(infname, pocmd);
1633 +    return;
1634 +    } // streaming from stdin
1635 + s.startTokenize(",:");
1636 + s.nextToken(infname);
1637 + bool paired=s.nextToken(infname2);
1638 + if (fileExists(infname.chars())==0)
1639 +    GError("Error: cannot find file %s!\n",infname.chars());
1640 + GStr fname(getFileName(infname.chars()));
1641 + GStr picmd;
1642 + guess_unzip(fname, picmd);
1643 + if (picmd.is_empty()) {
1644 +   f_in=fopen(infname.chars(), "r");
1645 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1646 +   }
1647 +  else {
1648 +   picmd.append(infname);
1649 +   f_in=popen(picmd.chars(), "r");
1650 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1651 +   }
1652 + if (f_out==stdout) {
1653 +   if (paired) GError("Error: output suffix required for paired reads\n");
1654 +   return;
1655 +   }
1656 + f_out=prepOutFile(infname, pocmd);
1657 + if (!paired) return;
1658 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1659 + // ---- paired reads:-------------
1660 + if (fileExists(infname2.chars())==0)
1661 +     GError("Error: cannot find file %s!\n",infname2.chars());
1662 + picmd="";
1663 + GStr fname2(getFileName(infname2.chars()));
1664 + guess_unzip(fname2, picmd);
1665 + if (picmd.is_empty()) {
1666 +   f_in2=fopen(infname2.chars(), "r");
1667 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1668 +   }
1669 +  else {
1670 +   picmd.append(infname2);
1671 +   f_in2=popen(picmd.chars(), "r");
1672 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1673 +   }
1674 + f_out2=prepOutFile(infname2, pocmd);
1675 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines