ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
# Line 31 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -A  disable polyA trimming (enabled by default)\n\
35 < -T  enable polyT trimming (disabled by default)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
# Line 49 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
55 >
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
# Line 63 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 93 | Line 94
94   int qv_cvtadd=0; //could be -31 or +31
95  
96   // adaptor matching metrics -- for X-drop ungapped extension
97 + //const int match_reward=2;
98 + //const int mismatch_penalty=3;
99 + const int match_reward=2;
100 + const int mismatch_penalty=4;
101 + const int Xdrop=10;
102 +
103   const int poly_m_score=2; //match score
104   const int poly_mis_score=-3; //mismatch
105   const int poly_dropoff_score=7;
# Line 101 | Line 108
108   const char *polyA_seed="AAAA";
109   const char *polyT_seed="TTTT";
110  
111 < struct CAdapters {
112 <    GStr a5;
113 <    GStr a3;
114 <    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
115 <      }
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adapter
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 >   GStr seq; //actual adapter sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adapter length matching that's
118 >              //enough to consider the alignment
119 >   bool use_reverse;
120 >   CASeqData(bool rev=false):seq(),seqr(),
121 >             amlen(0), use_reverse(rev) {
122 >     for (int i=0;i<4096;i++) {
123 >        pz[i]=NULL;
124 >        pzr[i]=NULL;
125 >        }
126 >     }
127 >
128 >   void update(const char* s) {
129 >         seq=s;
130 >         table6mers(seq.chars(), seq.length(), pz);
131 >         amlen=iround(double(seq.length())*0.8);
132 >         if (amlen<12)
133 >                amlen=12;
134 >         if (!use_reverse) return;
135 >         //reverse complement
136 >         seqr=s;
137 >         int slen=seq.length();
138 >         for (int i=0;i<slen;i++)
139 >           seqr[i]=ntComplement(seq[slen-i-1]);
140 >         table6mers(seqr.chars(), seqr.length(), pzr);
141 >   }
142 >
143 >   ~CASeqData() {
144 >     for (int i=0;i<4096;i++) {
145 >       delete pz[i];
146 >       delete pzr[i];
147 >     }
148 >   }
149   };
150  
151 < GPVec<CAdapters> adapters;
151 > GVec<CASeqData> adapters5;
152 > GVec<CASeqData> adapters3;
153 >
154 > CGreedyAlignData* gxmem_l=NULL;
155 > CGreedyAlignData* gxmem_r=NULL;
156  
157   // element in dhash:
158   class FqDupRec {
# Line 158 | Line 202
202  
203   GHash<FqDupRec> dhash; //hash to keep track of duplicates
204  
205 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
206   int loadAdapters(const char* fname);
207  
208 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
209 <                       GStr& s, GStr& infname, GStr& infname2);
208 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
209 >                       GStr& s, GStr& infname, GStr& infname2);
210   // uses outsuffix to generate output file names and open file handles as needed
211 <
211 >
212   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
213   void trash_report(char trashcode, GStr& rname, FILE* freport);
214  
215 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
215 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
216            GStr& rname, GStr& rinfo, GStr& infname);
217  
218 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
218 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
219   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
220  
221   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
# Line 185 | Line 230
230   void convertPhred(GStr& q);
231  
232   int main(int argc, char * const argv[]) {
233 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
233 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
234    int e;
235    if ((e=args.isError())>0) {
236        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 196 | Line 241
241    convert_phred=(args.getOpt('Q')!=NULL);
242    doCollapse=(args.getOpt('C')!=NULL);
243    doDust=(args.getOpt('D')!=NULL);
244 +  revCompl=(args.getOpt('R')!=NULL);
245 +  if (args.getOpt('A')) doPolyTrim=false;
246    /*
247    rawFormat=(args.getOpt('R')!=NULL);
248    if (rawFormat) {
# Line 204 | Line 251
251    */
252    prefix=args.getOpt('n');
253    GStr s=args.getOpt('l');
254 <  if (!s.is_empty())
254 >  if (!s.is_empty())
255       min_read_len=s.asInt();
256    s=args.getOpt('m');
257 <  if (!s.is_empty())
257 >  if (!s.is_empty())
258       max_perc_N=s.asDouble();
259    s=args.getOpt('d');
260    if (!s.is_empty()) {
# Line 233 | Line 280
280          qv_phredtype=64;
281          qv_cvtadd=-31;
282          }
283 <       else
283 >       else
284           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
285       }
286    s=args.getOpt('f');
287    if (!s.is_empty()) {
288     loadAdapters(s.chars());
289     }
290 <  bool fileAdapters=adapters.Count();
290 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
291    s=args.getOpt('5');
292    if (!s.is_empty()) {
293      if (fileAdapters)
294        GError("Error: options -5 and -f cannot be used together!\n");
295      s.upper();
296 <    adapters.Add(new CAdapters(s.chars()));
296 >    addAdapter(adapters5, s);
297      }
298    s=args.getOpt('3');
299    if (!s.is_empty()) {
300      if (fileAdapters)
301        GError("Error: options -3 and -f cannot be used together!\n");
302 <    s.upper();
303 <    if (adapters.Count()>0)
257 <          adapters[0]->a3=s.chars();
258 <     else adapters.Add(NULL, new CAdapters(s.chars()));
302 >      s.upper();
303 >      addAdapter(adapters3, s);
304      }
305    s=args.getOpt('y');
306    if (!s.is_empty()) {
307       int minmatch=s.asInt();
308       poly_minScore=minmatch*poly_m_score;
309       }
310 <  
310 >
311    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
312                           else outsuffix="-";
313    trashReport=  (args.getOpt('r')!=NULL);
# Line 279 | Line 324
324    if (trashReport)
325      openfw(freport, args, 'r');
326    char* infile=NULL;
327 +
328 +  if (adapters5.Count()>0)
329 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
330 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
331 +  if (adapters3.Count()>0)
332 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 +
334    while ((infile=args.nextNonOpt())!=NULL) {
335 +    //for each input file
336      int incounter=0; //counter for input reads
337      int outcounter=0; //counter for output reads
338      int trash_s=0; //too short from the get go
339      int trash_Q=0;
340      int trash_N=0;
341      int trash_D=0;
342 +    int trash_poly=0;
343      int trash_A3=0;
344      int trash_A5=0;
345      s=infile;
# Line 310 | Line 364
364         int a5=0, a3=0, b5=0, b3=0;
365         char tcode=0, tcode2=0;
366         tcode=process_read(seqid, rseq, rqv, a5, a3);
367 <       //if (!doCollapse) {
314 <         if (fq2!=NULL) {
367 >       if (fq2!=NULL) {
368              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
369              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
370                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 343 | Line 396
396                 int nocounter=0;
397                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
398                 }
399 <            } //paired read
347 <       // }
399 >            } //pair read
400         if (tcode>1) { //trashed
401 +         #ifdef GDEBUG
402 +         GMessage(" !!!!TRASH code = %c\n",tcode);
403 +         #endif
404            if (tcode=='s') trash_s++;
405 +          else if (tcode=='A' || tcode=='T') trash_poly++;
406              else if (tcode=='Q') trash_Q++;
407                else if (tcode=='N') trash_N++;
408                 else if (tcode=='D') trash_D++;
# Line 359 | Line 415
415              rseq=rseq.substr(a5,a3-a5+1);
416              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
417              }
418 +         #ifdef GDEBUG
419 +            GMessage("  After trimming:\n");
420 +            GMessage("%s\n",rseq.chars());
421 +         #endif
422            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
423            }
424         } //for each fastq record
# Line 386 | Line 446
446                 }
447              }
448           outcounter++;
449 <         if (qd->count>maxdup_count) {
449 >         if (qd->count>maxdup_count) {
450              maxdup_count=qd->count;
451              maxdup_seq=seq;
452              }
453           if (isfasta) {
454             if (prefix.is_empty()) {
455 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
455 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
456                             rseq.chars());
457               }
458             else { //use custom read name
# Line 403 | Line 463
463           else { //fastq format
464            if (convert_phred) convertPhred(qd->qv, qd->len);
465            if (prefix.is_empty()) {
466 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
466 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
467                             rseq.chars(), qd->qv);
468              }
469            else { //use custom read name
# Line 439 | Line 499
499           GMessage("         Trashed by N%%:%9d\n", trash_N);
500         if (trash_Q>0)
501           GMessage("Trashed by low quality:%9d\n", trash_Q);
502 +       if (trash_poly>0)
503 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
504         if (trash_A5>0)
505           GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
506         if (trash_A3>0)
# Line 450 | Line 512
512      FWCLOSE(f_out);
513      FWCLOSE(f_out2);
514     } //while each input file
515 <
515 > delete gxmem_l;
516 > delete gxmem_r;
517   //getc(stdin);
518   }
519  
# Line 465 | Line 528
528     const char* seq;
529     bool valid;
530     NData() {
531 +    seqlen=0;
532      NCount=0;
533      end5=0;
534      end3=0;
# Line 495 | Line 559
559       perc_N=(n*100.0)/(end5-end3+1);
560       }
561   };
562 <
562 >
563   static NData feat;
564   int perc_lenN=12; // incremental distance from ends, in percentage of
565            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
566 <          
566 >
567   void N_analyze(int l5, int l3, int p5, int p3) {
568   /* assumes feat was filled properly */
569   int old_dif, t5,t3,v;
570   if (l3<l5+2 || p5>p3 ) {
571     feat.end5=l5+1;
572     feat.end3=l3+1;
573 <   return;
573 >   return;
574     }
575  
576   t5=feat.NPos[p5]-l5;
577   t3=l3-feat.NPos[p3];
578   old_dif=p3-p5;
579   v=(int)((((double)(l3-l5))*perc_lenN)/100);
580 < if (v>20) v=20; /* enforce N-search limit for very long reads */
580 > if (v>20) v=20; /* enforce N-search limit for very long reads */
581   if (t5 < v ) {
582     l5=feat.NPos[p5]+1;
583     p5++;
# Line 530 | Line 594
594             feat.end3=l3+1;
595             return;
596             }
597 <    else
597 >    else
598        N_analyze(l5,l3, p5,p3);
599   }
600  
# Line 571 | Line 635
635   feat.init(rseq);
636   l5=feat.end5-1;
637   l3=feat.end3-1;
638 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
638 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
639   if (l5==feat.end5-1 && l3==feat.end3-1) {
640      if (feat.perc_N>max_perc_N) {
641             feat.valid=false;
# Line 589 | Line 653
653     return true;
654     }
655   feat.N_calc();
656 <
656 >
657   if (feat.perc_N>max_perc_N) {
658        feat.valid=false;
659        l3=l5+1;
# Line 601 | Line 665
665   //--------------- dust functions ----------------
666   class DNADuster {
667   public:
668 <  int dustword;
669 <  int dustwindow;
670 <  int dustwindow2;
668 >  int dustword;
669 >  int dustwindow;
670 >  int dustwindow2;
671    int dustcutoff;
672    int mv, iv, jv;
673    int counts[32*32*32];
# Line 698 | Line 762
762                      }
763             }
764           }
765 < //return first;
765 > //return first;
766   }
767   };
768  
# Line 716 | Line 780
780   return ncount;
781   }
782  
719 int get3mer_value(const char* s) {
720 return (s[0]<<16)+(s[1]<<8)+s[2];
721 }
722
723 int w3_match(int qv, const char* str, int slen, int start_index=0) {
724 if (start_index>=slen || start_index<0) return -1;
725 for (int i=start_index;i<slen-3;i++) {
726   int rv=get3mer_value(str+i);
727   if (rv==qv) return i;
728   }
729 return -1;
730 }
731
732 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733 if (end_index>=slen) return -1;
734 if (end_index<0) end_index=slen-1;
735 for (int i=end_index-2;i>=0;i--) {
736   int rv=get3mer_value(str+i);
737   if (rv==qv) return i;
738   }
739 return -1;
740 }
741
783   struct SLocScore {
784    int pos;
785    int score;
# Line 757 | Line 798
798   };
799  
800   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
801 + if (!doPolyTrim) return false;
802   int rlen=seq.length();
803   l5=0;
804   l3=rlen-1;
# Line 765 | Line 807
807   //assumes N trimming was already done
808   //so a poly match should be very close to the end of the read
809   // -- find the initial match (seed)
810 < int lmin=GMAX((rlen-12), 0);
810 > int lmin=GMAX((rlen-16), 0);
811   int li;
812   for (li=rlen-4;li>lmin;li--) {
813     if (seedVal==*(int*)&(seq[li])) {
# Line 779 | Line 821
821   SLocScore loc(ri, poly_m_score<<2);
822   SLocScore maxloc(loc);
823   //extend right
824 < while (ri<rlen-2) {
824 > while (ri<rlen-1) {
825     ri++;
826     if (seq[ri]==polyChar) {
827                  loc.add(ri,poly_m_score);
# Line 796 | Line 838
838        }
839     }
840   ri=maxloc.pos;
841 < if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
841 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
842   //ri = right boundary for the poly match
843   //extend left
844   loc.set(li, maxloc.score);
# Line 817 | Line 859
859         maxloc=loc;
860         }
861      }
862 < if (maxloc.score>poly_minScore && ri>=rlen-3) {
863 <    l5=li;
864 <    l3=ri;
862 > li=maxloc.pos;
863 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
864 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
865 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
866 >  //trimming this li-ri match at 3' end
867 >    l3=li-1;
868 >    if (l3<0) l3=0;
869      return true;
870      }
871   return false;
872   }
873  
828
874   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
875 + if (!doPolyTrim) return false;
876   int rlen=seq.length();
877   l5=0;
878   l3=rlen-1;
# Line 835 | Line 881
881   //assumes N trimming was already done
882   //so a poly match should be very close to the end of the read
883   // -- find the initial match (seed)
884 < int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
884 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
885   int li;
886   for (li=0;li<=lmax;li++) {
887     if (seedVal==*(int*)&(seq[li])) {
# Line 865 | Line 911
911         }
912      }
913   li=maxloc.pos;
914 < if (li>3) return false; //no trimming wanted, too far from 5' end
914 > if (li>5) return false; //no trimming wanted, too far from 5' end
915   //li = right boundary for the poly match
916  
917   //extend right
918   loc.set(ri, maxloc.score);
919   maxloc.pos=ri;
920 < while (ri<rlen-2) {
920 > while (ri<rlen-1) {
921     ri++;
922     if (seq[ri]==polyChar) {
923                  loc.add(ri,poly_m_score);
# Line 887 | Line 933
933        maxloc=loc;
934        }
935     }
936 <
937 < if (maxloc.score>poly_minScore && li<=3) {
938 <    l5=li;
939 <    l3=ri;
936 > ri=maxloc.pos;
937 > if ((maxloc.score==poly_minScore && li==0) ||
938 >     (maxloc.score>poly_minScore && li<2)
939 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
940 >    //adjust l5 to reflect this trimming of 5' end
941 >    l5=ri+1;
942 >    if (l5>rlen-1) l5=rlen-1;
943      return true;
944      }
945   return false;
946   }
947  
948   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
949 + if (adapters3.Count()==0) return false;
950   int rlen=seq.length();
951   l5=0;
952   l3=rlen-1;
953 < //first try a full match, we might get lucky
954 < int fi=-1;
955 < if ((fi=seq.index(adapter3))>=0) {
956 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
957 <      l5=fi+a3len;
958 <      l3=rlen-1;
959 <      }
960 <    else {
961 <      l5=0;
962 <      l3=fi-1;
963 <      }
964 <   return true;
965 <   }
966 < #ifdef DEBUG
967 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
968 < #endif
969 <
970 < //also, for fast detection of other adapter-only reads that start past
971 < // the beginning of the adapter sequence, try to see if the first a3len-4
972 < // bases of the read are a substring of the adapter
973 < if (rlen>a3len-3) {
974 <   GStr rstart=seq.substr(1,a3len-4);
975 <   if ((fi=adapter3.index(rstart))>=0) {
976 <     l3=rlen-1;
927 <     l5=a3len-4;
928 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 <     return true;
930 <     }
931 <  }
932 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 <  //check the easy cases - 11 bases exact match at the end
934 < int fdlen=11;
935 <  if (a3len<16) {
936 <   fdlen=a3len>>1;
937 <   }
938 < if (fdlen>4) {
939 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
941 <     if ((fi=adapter3.index(rstart))>=0) {
942 < #ifdef DEBUG
943 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 < #endif
945 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
947 <            return true;
948 <       }
949 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
950 <     GStr bstr=adapter3.substr(0, fdlen);
951 <     if ((fi=seq.rindex(bstr))>=0) {
952 < #ifdef DEBUG
953 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
954 < #endif
955 <       if (extendMatch(seq.chars(), rlen, fi,
956 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
957 <            return true;
958 <       }
959 <     } //tried to match 11 bases first
960 <    
961 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 < //-- only extend if the match is in the 3' (ending) region of the read
963 < int wordSize=3;
964 < int hlen=12;
965 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 < imin=rlen-imin;
969 < for (int iw=0;iw<hlen;iw++) {
970 <   //int32* qv=(int32*)(adapter3.chars()+iw);
971 <   int qv=get3mer_value(adapter3.chars()+iw);
972 <   fi=-1;
973 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976 <
977 < #ifdef DEBUG
978 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 < #endif
980 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 <     fi--;
983 <     if (fi<imin) break;
984 <     }
985 <   } //for each wmer in the first hlen bases of the adaptor
986 < /*
987 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 < int hlen2=a3len-wordSize;
991 < //if (hlen2>a3len-4) hlen2=a3len-4;
992 < if (hlen2>hlen) {
993 < #ifdef DEBUG
994 <     if (debug && a3segs.Count()>0) {
995 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
996 <        }
997 < #endif
998 <     for (int iw=hlen;iw<hlen2;iw++) {
999 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1000 <         int qv=get3mer_value(adapter3.chars()+iw);
1001 <         fi=-1;
1002 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005 <                         a3len, iw, wordSize, l5,l3, a3segs);
1006 <           fi--;
1007 <           if (fi<imin) break;
1008 <           }
1009 <         } //for each wmer between hlen2 and hlen bases of the adaptor
953 > bool trimmed=false;
954 > GStr wseq(seq);
955 > int wlen=rlen;
956 > GXSeqData seqdata;
957 > for (int ai=0;ai<adapters3.Count();ai++) {
958 >  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
959 >       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
960 >  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
961 >  if (bestaln) {
962 >     trimmed=true;
963 >     //keep unmatched region on the left OR right (the longer one)
964 >     if (bestaln->sl > wlen-bestaln->sr) {
965 >         //keep left side
966 >         l3-=(wlen-bestaln->sl+1);
967 >         if (l3<0) l3=0;
968 >         }
969 >     else { //keep right side
970 >         l5+=bestaln->sr;
971 >         if (l5>=rlen) l5=rlen-1;
972 >         }
973 >     delete bestaln;
974 >     if (l3-l5+1<min_read_len) return true;
975 >     wseq=seq.substr(l5,l3-l5+1);
976 >     wlen=wseq.length();
977       }
978 < //lastly, analyze collected a3segs for a possible gapped alignment:
979 < GList<CSegChain> segchains(false,true,false);
1013 < #ifdef DEBUG
1014 < if (debug && a3segs.Count()>0) {
1015 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1016 <   }
1017 < #endif
1018 < for (int i=0;i<a3segs.Count();i++) {
1019 <   if (a3segs[i]->chain==NULL) {
1020 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 <       CSegChain* newchain=new CSegChain();
1022 <       newchain->setFreeItem(false);
1023 <       newchain->addSegPair(a3segs[i]);
1024 <       a3segs[i]->chain=newchain;
1025 <       segchains.Add(newchain); //just to free them when done
1026 <       }
1027 <   for (int j=i+1;j<a3segs.Count();j++) {
1028 <      CSegChain* chain=a3segs[i]->chain;
1029 <      if (chain->extendChain(a3segs[j])) {
1030 <          a3segs[j]->chain=chain;
1031 < #ifdef DEBUG
1032 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 < #endif      
1034 <          //save time by checking here if the extended chain is already acceptable for trimming
1035 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 <            l5=0;
1037 <            l3=chain->astart-2;
1038 < #ifdef DEBUG
1039 <          if (debug && a3segs.Count()>0) {
1040 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 <            }
1042 < #endif
1043 <            return true;
1044 <            }
1045 <          } //chain can be extended
1046 <      }
1047 <   } //collect segment alignments into chains
1048 < */  
1049 < return false; //no adapter parts found
978 >  }//for each 5' adapter
979 >  return trimmed;
980   }
981  
982   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
983 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
983 > if (adapters5.Count()==0) return false;
984   int rlen=seq.length();
985   l5=0;
986   l3=rlen-1;
987 < //try to see if adapter is fully included in the read
988 < int fi=-1;
989 < for (int ai=0;ai<adapters.Count();ai++) {
990 <  if (adapters[ai]->a5.is_empty()) continue;
991 <  int a5len=adapters[ai]->a5.length();
992 <  GStr& adapter5=adapters[ai]->a5;
993 <  if ((fi=seq.index(adapter5))>=0) {
994 <    if (fi<rlen-fi-a5len) {//match is closer to the right end
995 <       l5=fi+a5len;
996 <       l3=rlen-1;
997 <       }
998 <     else {
999 <       l5=0;
1000 <       l3=fi-1;
1001 <       }
1002 <    return true;
1003 <    }
1004 < #ifdef DEBUG
1005 <  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1006 < #endif
1007 <
1008 <  //try the easy way out first - look for an exact match of 11 bases
1009 <  int fdlen=11;
1010 <   if (a5len<16) {
1011 <    fdlen=a5len>>1;
1012 <    }
1083 <  if (fdlen>4) {
1084 <      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 <      if ((fi=adapter5.index(rstart))>=0) {
1086 < #ifdef DEBUG
1087 <        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 < #endif
1089 <        if (extendMatch(seq.chars(), rlen, 1,
1090 <                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091 <            return true;
1092 <        }
1093 <      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 <      GStr bstr=adapter5.substr(-fdlen);
1095 <      if ((fi=seq.index(bstr))>=0) {
1096 < #ifdef DEBUG
1097 <        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 < #endif
1099 <        if (extendMatch(seq.chars(), rlen, fi,
1100 <                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 <           return true;
1102 <        }
1103 <      } //tried to matching at most 11 bases first
1104 <
1105 <  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106 <  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 <  int wordSize=3;
1108 <  int hlen=12;
1109 <  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 <  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 <  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 <  for (int iw=0;iw<=hlen;iw++) {
1113 <    int apstart=a5len-iw-wordSize;
1114 <    fi=0;
1115 <    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 <    int qv=get3mer_value(adapter5.chars()+apstart);
1117 <    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 <    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 < #ifdef DEBUG
1120 <      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 < #endif
1122 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 <                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 <      fi++;
1125 <      if (fi>imax) break;
1126 <      }
1127 <    } //for each wmer in the last hlen bases of the adaptor
1128 < //if we're here we couldn't find a good extension
1129 <  return false; //no adapter parts found
1130 < }//for each 5' adapter
987 > bool trimmed=false;
988 > GStr wseq(seq);
989 > int wlen=rlen;
990 > GXSeqData seqdata;
991 > for (int ai=0;ai<adapters5.Count();ai++) {
992 >   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
993 >       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
994 >  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
995 >  if (bestaln) {
996 >     trimmed=true;
997 >     if (bestaln->sl-1 > wlen-bestaln->sr) {
998 >         //keep left side
999 >         l3-=(wlen-bestaln->sl+1);
1000 >         if (l3<0) l3=0;
1001 >         }
1002 >     else { //keep right side
1003 >         l5+=bestaln->sr;
1004 >         if (l5>=rlen) l5=rlen-1;
1005 >         }
1006 >     delete bestaln;
1007 >     if (l3-l5+1<min_read_len) return true;
1008 >     wseq=seq.substr(l5,l3-l5+1);
1009 >     wlen=wseq.length();
1010 >     }
1011 >  }//for each 5' adapter
1012 >  return trimmed;
1013   }
1014  
1015 < //convert qvs to/from phred64 from/to phread33
1015 > //convert qvs to/from phred64 from/to phread33
1016   void convertPhred(GStr& q) {
1017   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1018   }
# Line 1139 | Line 1021
1021   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1022   }
1023  
1024 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1024 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1025            GStr& rname, GStr& rinfo, GStr& infname) {
1026   rseq="";
1027   rqv="";
# Line 1155 | Line 1037
1037        } //raw qseq format
1038   else { // FASTQ or FASTA */
1039   isfasta=(l[0]=='>');
1040 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1040 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1041        infname.chars(), l);
1042   GStr s(l);
1043   rname=&(l[1]);
1044   for (int i=0;i<rname.length();i++)
1045 <    if (rname[i]<=' ') {
1046 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1047 <       rname.cut(i);
1048 <       break;
1045 >    if (rname[i]<=' ') {
1046 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1047 >       rname.cut(i);
1048 >       break;
1049         }
1050    //now get the sequence
1051 < if ((l=fq.getLine())==NULL)
1051 > if ((l=fq.getLine())==NULL)
1052        GError("Error: unexpected EOF after header for read %s (%s)\n",
1053                     rname.chars(), infname.chars());
1054   rseq=l; //this must be the DNA line
1055   while ((l=fq.getLine())!=NULL) {
1056        //seq can span multiple lines
1057        if (l[0]=='>' || l[0]=='+') {
1058 <           fq.pushBack();
1058 >           fq.pushBack();
1059             break; //
1060             }
1061        rseq+=l;
1062 <      } //check for multi-line seq
1062 >      } //check for multi-line seq
1063   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1064      if ((l=fq.getLine())==NULL)
1065          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1066      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1067 <    if ((l=fq.getLine())==NULL)
1067 >    if ((l=fq.getLine())==NULL)
1068          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1069      rqv=l;
1070 <    //if (rqv.length()!=rseq.length())
1070 >    //if (rqv.length()!=rseq.length())
1071      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1072      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1073        rqv+=l; //append to qv string
# Line 1196 | Line 1078
1078   return true;
1079   }
1080  
1081 + #ifdef GDEBUG
1082 + void showTrim(GStr& s, int l5, int l3) {
1083 +  if (l5>0 || l3==0) {
1084 +    color_bg(c_red);
1085 +    }
1086 +  for (int i=0;i<s.length()-1;i++) {
1087 +    if (i && i==l5) color_resetbg();
1088 +    fprintf(stderr, "%c", s[i]);
1089 +    if (i && i==l3) color_bg(c_red);
1090 +   }
1091 +  fprintf(stderr, "%c", s[s.length()-1]);
1092 +  color_reset();
1093 +  fprintf(stderr, "\n");
1094 + }
1095 + #endif
1096 +
1097   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1098 < //returns 0 if the read was untouched, 1 if it was just trimmed
1098 > //returns 0 if the read was untouched, 1 if it was just trimmed
1099   // and a trash code if it was trashed
1100   l5=0;
1101   l3=rseq.length()-1;
1102 + #ifdef GDEBUG
1103 +   //rseq.reverse();
1104 +   GMessage(">%s\n", rname.chars());
1105 +   GMessage("%s\n",rseq.chars());
1106 + #endif
1107   if (l3-l5+1<min_read_len) {
1108     return 's';
1109     }
# Line 1236 | Line 1139
1139     w5=0;
1140     w3=wseq.length()-1;
1141     }
1142 < if (a3len>0) {
1143 <  if (trim_adapter3(wseq, w5, w3)) {
1142 > char trim_code;
1143 > do {
1144 >  trim_code=0;
1145 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1146 >      trim_code='A';
1147 >      }
1148 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1149 >      trim_code='T';
1150 >      }
1151 >  else if (trim_adapter5(wseq, w5, w3)) {
1152 >      trim_code='5';
1153 >      }
1154 >  if (trim_code) {
1155 >     #ifdef GDEBUG
1156 >      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1157 >      showTrim(wseq, w5, w3);
1158 >     #endif
1159       int trimlen=wseq.length()-(w3-w5+1);
1160 <     num_trimmed3++;
1161 <     if (trimlen<min_trimmed3)
1162 <         min_trimmed3=trimlen;
1160 >     num_trimmed5++;
1161 >     if (trimlen<min_trimmed5)
1162 >         min_trimmed5=trimlen;
1163       l5+=w5;
1164       l3-=(wseq.length()-1-w3);
1165       if (w3-w5+1<min_read_len) {
1166 <         return '3';
1166 >         return trim_code;
1167           }
1168        //-- keep only the w5..w3 range
1169        wseq=wseq.substr(w5, w3-w5+1);
1170        if (!wqv.is_empty())
1171           wqv=wqv.substr(w5, w3-w5+1);
1172 <      }//some adapter was trimmed
1173 <   } //adapter trimming
1174 < if (a5len>0) {
1175 <  if (trim_adapter5(wseq, w5, w3)) {
1172 >      }// trimmed at 5' end
1173 > } while (trim_code);
1174 >
1175 > do {
1176 >  trim_code=0;
1177 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1178 >      trim_code='A';
1179 >      }
1180 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1181 >      trim_code='T';
1182 >      }
1183 >  else if (trim_adapter3(wseq, w5, w3)) {
1184 >      trim_code='3';
1185 >      }
1186 >  if (trim_code) {
1187 >     #ifdef GDEBUG
1188 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1189 >     showTrim(wseq, w5, w3);
1190 >     #endif
1191       int trimlen=wseq.length()-(w3-w5+1);
1192 <     num_trimmed5++;
1193 <     if (trimlen<min_trimmed5)
1194 <         min_trimmed5=trimlen;
1192 >     num_trimmed3++;
1193 >     if (trimlen<min_trimmed3)
1194 >         min_trimmed3=trimlen;
1195       l5+=w5;
1196       l3-=(wseq.length()-1-w3);
1197       if (w3-w5+1<min_read_len) {
1198 <         return '5';
1198 >         return trim_code;
1199           }
1200        //-- keep only the w5..w3 range
1201        wseq=wseq.substr(w5, w3-w5+1);
1202        if (!wqv.is_empty())
1203           wqv=wqv.substr(w5, w3-w5+1);
1204 <      }//some adapter was trimmed
1205 <   } //adapter trimming
1204 >      }//trimming at 3' end
1205 > } while (trim_code);
1206 >
1207 >
1208   if (doCollapse) {
1209     //keep read for later
1210     FqDupRec* dr=dhash.Find(wseq.chars());
1211     if (dr==NULL) { //new entry
1212 <          //if (prefix.is_empty())
1213 <             dhash.Add(wseq.chars(),
1212 >          //if (prefix.is_empty())
1213 >             dhash.Add(wseq.chars(),
1214                    new FqDupRec(&wqv, rname.chars()));
1215            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1216           }
# Line 1309 | Line 1244
1244         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1245         }
1246        else {
1247 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1247 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1248                            rseq.chars());
1249         }
1250       }
# Line 1320 | Line 1255
1255         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1256         }
1257        else
1258 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1258 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1259                            rseq.chars(),rqv.chars() );
1260       }
1261   }
# Line 1328 | Line 1263
1263   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1264   if (freport==NULL || trashcode<=' ') return;
1265   if (trashcode=='3' || trashcode=='5') {
1266 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1266 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1267     }
1268   else {
1269     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1420 | Line 1355
1355      }
1356   }
1357  
1358 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1359 + //TODO: prepare CASeqData here, and collect hexamers as well
1360 + CASeqData adata(revCompl);
1361 + int idx=adapters.Add(adata);
1362 + if (idx<0) GError("Error: failed to add adaptor!\n");
1363 + adapters[idx].update(seq.chars());
1364 + }
1365 +
1366  
1367   int loadAdapters(const char* fname) {
1368    GLineReader lr(fname);
# Line 1433 | Line 1376
1376          i++;
1377          }
1378        if (l[i]!=0) {
1379 <        adapters.Add(new CAdapters(NULL, &(l[i])));
1379 >        GStr s(&(l[i]));
1380 >      #ifdef GDEBUG
1381 >          //s.reverse();
1382 >      #endif
1383 >        addAdapter(adapters3, s);
1384          continue;
1385          }
1386        }
# Line 1443 | Line 1390
1390        GStr a5,a3;
1391        if (s.nextToken(a5))
1392           s.nextToken(a3);
1393 <      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1394 <                                a3.is_empty()?NULL:a3.chars()));
1393 >      a5.upper();
1394 >      a3.upper();
1395 >     #ifdef GDEBUG
1396 >     //   a5.reverse();
1397 >     //   a3.reverse();
1398 >     #endif
1399 >      addAdapter(adapters5, a5);
1400 >      addAdapter(adapters3, a3);
1401        }
1402     }
1403 <   return adapters.Count();
1403 >   return adapters5.Count()+adapters3.Count();
1404   }
1405  
1406 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1407 <                       GStr& s, GStr& infname, GStr& infname2) {
1406 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1407 >                       GStr& s, GStr& infname, GStr& infname2) {
1408   // uses outsuffix to generate output file names and open file handles as needed
1409   infname="";
1410   infname2="";
# Line 1479 | Line 1432
1432   s.startTokenize(",:");
1433   s.nextToken(infname);
1434   bool paired=s.nextToken(infname2);
1435 < if (fileExists(infname.chars())==0)
1435 > if (fileExists(infname.chars())==0)
1436      GError("Error: cannot find file %s!\n",infname.chars());
1437   GStr fname(getFileName(infname.chars()));
1438   GStr picmd;
# Line 1501 | Line 1454
1454   if (!paired) return;
1455   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1456   // ---- paired reads:-------------
1457 < if (fileExists(infname2.chars())==0)
1457 > if (fileExists(infname2.chars())==0)
1458       GError("Error: cannot find file %s!\n",infname2.chars());
1459   picmd="";
1460   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines