ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 31 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -A  disable polyA trimming (enabled by default)\n\
35 < -T  enable polyT trimming (disabled by default)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
# Line 69 | Line 68
68   bool verbose=false;
69   bool doCollapse=false;
70   bool doDust=false;
71 + bool doPolyTrim=true;
72   bool fastaOutput=false;
73   bool trashReport=false;
74   //bool rawFormat=false;
# Line 93 | Line 93
93   int qv_cvtadd=0; //could be -31 or +31
94  
95   // adaptor matching metrics -- for X-drop ungapped extension
96 + const int match_reward=2;
97 + const int mismatch_penalty=3;
98 + const int Xdrop=8;
99 +
100   const int poly_m_score=2; //match score
101   const int poly_mis_score=-3; //mismatch
102   const int poly_dropoff_score=7;
# Line 100 | Line 104
104  
105   const char *polyA_seed="AAAA";
106   const char *polyT_seed="TTTT";
107 + GVec<GStr> adapters5;
108 + GVec<GStr> adapters3;
109  
110 < struct CAdapters {
111 <    GStr a5;
106 <    GStr a3;
107 <    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 <      }
109 < };
110 <
111 < GPVec<CAdapters> adapters;
110 > CGreedyAlignData* gxmem_l=NULL;
111 > CGreedyAlignData* gxmem_r=NULL;
112  
113   // element in dhash:
114   class FqDupRec {
# Line 185 | Line 185
185   void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
188 >  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 196 | Line 196
196    convert_phred=(args.getOpt('Q')!=NULL);
197    doCollapse=(args.getOpt('C')!=NULL);
198    doDust=(args.getOpt('D')!=NULL);
199 +  if (args.getOpt('A')) doPolyTrim=false;
200    /*
201    rawFormat=(args.getOpt('R')!=NULL);
202    if (rawFormat) {
# Line 240 | Line 241
241    if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 <  bool fileAdapters=adapters.Count();
244 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
245    s=args.getOpt('5');
246    if (!s.is_empty()) {
247      if (fileAdapters)
248        GError("Error: options -5 and -f cannot be used together!\n");
249      s.upper();
250 <    adapters.Add(new CAdapters(s.chars()));
250 >    adapters5.Add(s);
251      }
252    s=args.getOpt('3');
253    if (!s.is_empty()) {
254      if (fileAdapters)
255        GError("Error: options -3 and -f cannot be used together!\n");
256 <    s.upper();
257 <    if (adapters.Count()>0)
257 <          adapters[0]->a3=s.chars();
258 <     else adapters.Add(NULL, new CAdapters(s.chars()));
256 >      s.upper();
257 >      adapters3.Add(s);
258      }
259    s=args.getOpt('y');
260    if (!s.is_empty()) {
# Line 279 | Line 278
278    if (trashReport)
279      openfw(freport, args, 'r');
280    char* infile=NULL;
281 +
282 +  if (adapters5.Count()>0)
283 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 +  if (adapters3.Count()>0)
285 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286 +
287    while ((infile=args.nextNonOpt())!=NULL) {
288 +    //for each input file
289      int incounter=0; //counter for input reads
290      int outcounter=0; //counter for output reads
291      int trash_s=0; //too short from the get go
292      int trash_Q=0;
293      int trash_N=0;
294      int trash_D=0;
295 +    int trash_poly=0;
296      int trash_A3=0;
297      int trash_A5=0;
298      s=infile;
# Line 310 | Line 317
317         int a5=0, a3=0, b5=0, b3=0;
318         char tcode=0, tcode2=0;
319         tcode=process_read(seqid, rseq, rqv, a5, a3);
320 <       //if (!doCollapse) {
314 <         if (fq2!=NULL) {
320 >       if (fq2!=NULL) {
321              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 343 | Line 349
349                 int nocounter=0;
350                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351                 }
352 <            } //paired read
347 <       // }
352 >            } //pair read
353         if (tcode>1) { //trashed
354 +         #ifdef GDEBUG
355 +         GMessage(" !!!!TRASH => 'N'\n");
356 +         #endif
357            if (tcode=='s') trash_s++;
358 +          else if (tcode=='A' || tcode=='T') trash_poly++;
359              else if (tcode=='Q') trash_Q++;
360                else if (tcode=='N') trash_N++;
361                 else if (tcode=='D') trash_D++;
# Line 359 | Line 368
368              rseq=rseq.substr(a5,a3-a5+1);
369              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370              }
371 +         #ifdef GDEBUG
372 +            GMessage("  After trimming:\n");
373 +            GMessage("%s\n",rseq.chars());
374 +         #endif
375            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376            }
377         } //for each fastq record
# Line 439 | Line 452
452           GMessage("         Trashed by N%%:%9d\n", trash_N);
453         if (trash_Q>0)
454           GMessage("Trashed by low quality:%9d\n", trash_Q);
455 +       if (trash_poly>0)
456 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
457         if (trash_A5>0)
458           GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459         if (trash_A3>0)
# Line 450 | Line 465
465      FWCLOSE(f_out);
466      FWCLOSE(f_out2);
467     } //while each input file
468 <
468 > delete gxmem_l;
469 > delete gxmem_r;
470   //getc(stdin);
471   }
472  
# Line 757 | Line 773
773   };
774  
775   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 + if (!doPolyTrim) return false;
777   int rlen=seq.length();
778   l5=0;
779   l3=rlen-1;
# Line 765 | Line 782
782   //assumes N trimming was already done
783   //so a poly match should be very close to the end of the read
784   // -- find the initial match (seed)
785 < int lmin=GMAX((rlen-12), 0);
785 > int lmin=GMAX((rlen-16), 0);
786   int li;
787   for (li=rlen-4;li>lmin;li--) {
788     if (seedVal==*(int*)&(seq[li])) {
# Line 796 | Line 813
813        }
814     }
815   ri=maxloc.pos;
816 < if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
816 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817   //ri = right boundary for the poly match
818   //extend left
819   loc.set(li, maxloc.score);
# Line 817 | Line 834
834         maxloc=loc;
835         }
836      }
837 < if (maxloc.score>poly_minScore && ri>=rlen-3) {
838 <    l5=li;
839 <    l3=ri;
837 > li=maxloc.pos;
838 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
839 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
840 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
841 >  //trimming this li-ri match at 3' end
842 >    l3=li-1;
843 >    if (l3<0) l3=0;
844      return true;
845      }
846   return false;
847   }
848  
828
849   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
850 + if (!doPolyTrim) return false;
851   int rlen=seq.length();
852   l5=0;
853   l3=rlen-1;
# Line 835 | Line 856
856   //assumes N trimming was already done
857   //so a poly match should be very close to the end of the read
858   // -- find the initial match (seed)
859 < int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
859 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
860   int li;
861   for (li=0;li<=lmax;li++) {
862     if (seedVal==*(int*)&(seq[li])) {
# Line 865 | Line 886
886         }
887      }
888   li=maxloc.pos;
889 < if (li>3) return false; //no trimming wanted, too far from 5' end
889 > if (li>5) return false; //no trimming wanted, too far from 5' end
890   //li = right boundary for the poly match
891  
892   //extend right
# Line 887 | Line 908
908        maxloc=loc;
909        }
910     }
911 <
912 < if (maxloc.score>poly_minScore && li<=3) {
913 <    l5=li;
914 <    l3=ri;
911 > ri=maxloc.pos;
912 > if ((maxloc.score==poly_minScore && li==0) ||
913 >     (maxloc.score>poly_minScore && li<2)
914 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
915 >    //adjust l5 to reflect this trimming of 5' end
916 >    l5=ri+1;
917 >    if (l5>rlen-1) l5=rlen-1;
918      return true;
919      }
920   return false;
921   }
922  
923   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
924 + if (adapters3.Count()==0) return false;
925   int rlen=seq.length();
926   l5=0;
927   l3=rlen-1;
928 < //first try a full match, we might get lucky
929 < int fi=-1;
930 < if ((fi=seq.index(adapter3))>=0) {
931 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
932 <      l5=fi+a3len;
933 <      l3=rlen-1;
934 <      }
935 <    else {
936 <      l5=0;
937 <      l3=fi-1;
938 <      }
939 <   return true;
940 <   }
941 < #ifdef DEBUG
942 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
943 < #endif
944 <
920 < //also, for fast detection of other adapter-only reads that start past
921 < // the beginning of the adapter sequence, try to see if the first a3len-4
922 < // bases of the read are a substring of the adapter
923 < if (rlen>a3len-3) {
924 <   GStr rstart=seq.substr(1,a3len-4);
925 <   if ((fi=adapter3.index(rstart))>=0) {
926 <     l3=rlen-1;
927 <     l5=a3len-4;
928 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 <     return true;
930 <     }
931 <  }
932 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 <  //check the easy cases - 11 bases exact match at the end
934 < int fdlen=11;
935 <  if (a3len<16) {
936 <   fdlen=a3len>>1;
937 <   }
938 < if (fdlen>4) {
939 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
941 <     if ((fi=adapter3.index(rstart))>=0) {
942 < #ifdef DEBUG
943 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 < #endif
945 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
947 <            return true;
948 <       }
949 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
950 <     GStr bstr=adapter3.substr(0, fdlen);
951 <     if ((fi=seq.rindex(bstr))>=0) {
952 < #ifdef DEBUG
953 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
954 < #endif
955 <       if (extendMatch(seq.chars(), rlen, fi,
956 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
957 <            return true;
958 <       }
959 <     } //tried to match 11 bases first
960 <    
961 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 < //-- only extend if the match is in the 3' (ending) region of the read
963 < int wordSize=3;
964 < int hlen=12;
965 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 < imin=rlen-imin;
969 < for (int iw=0;iw<hlen;iw++) {
970 <   //int32* qv=(int32*)(adapter3.chars()+iw);
971 <   int qv=get3mer_value(adapter3.chars()+iw);
972 <   fi=-1;
973 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976 <
977 < #ifdef DEBUG
978 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 < #endif
980 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 <     fi--;
983 <     if (fi<imin) break;
984 <     }
985 <   } //for each wmer in the first hlen bases of the adaptor
986 < /*
987 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 < int hlen2=a3len-wordSize;
991 < //if (hlen2>a3len-4) hlen2=a3len-4;
992 < if (hlen2>hlen) {
993 < #ifdef DEBUG
994 <     if (debug && a3segs.Count()>0) {
995 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
996 <        }
997 < #endif
998 <     for (int iw=hlen;iw<hlen2;iw++) {
999 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1000 <         int qv=get3mer_value(adapter3.chars()+iw);
1001 <         fi=-1;
1002 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005 <                         a3len, iw, wordSize, l5,l3, a3segs);
1006 <           fi--;
1007 <           if (fi<imin) break;
1008 <           }
1009 <         } //for each wmer between hlen2 and hlen bases of the adaptor
928 > bool trimmed=false;
929 > GStr wseq(seq.chars());
930 > int wlen=rlen;
931 > for (int ai=0;ai<adapters3.Count();ai++) {
932 >  if (adapters3[ai].is_empty()) continue;
933 >  int alen=adapters3[ai].length();
934 >  GStr& aseq=adapters3[ai];
935 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
936 >  if (r_bestaln) {
937 >     trimmed=true;
938 >     //keep unmatched region on the left, if any
939 >     l3-=(wlen-r_bestaln->sl+1);
940 >     delete r_bestaln;
941 >     if (l3<0) l3=0;
942 >     if (l3-l5+1<min_read_len) return true;
943 >     wseq=seq.substr(l5,l3-l5+1);
944 >     wlen=wseq.length();
945       }
946 < //lastly, analyze collected a3segs for a possible gapped alignment:
947 < GList<CSegChain> segchains(false,true,false);
1013 < #ifdef DEBUG
1014 < if (debug && a3segs.Count()>0) {
1015 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1016 <   }
1017 < #endif
1018 < for (int i=0;i<a3segs.Count();i++) {
1019 <   if (a3segs[i]->chain==NULL) {
1020 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 <       CSegChain* newchain=new CSegChain();
1022 <       newchain->setFreeItem(false);
1023 <       newchain->addSegPair(a3segs[i]);
1024 <       a3segs[i]->chain=newchain;
1025 <       segchains.Add(newchain); //just to free them when done
1026 <       }
1027 <   for (int j=i+1;j<a3segs.Count();j++) {
1028 <      CSegChain* chain=a3segs[i]->chain;
1029 <      if (chain->extendChain(a3segs[j])) {
1030 <          a3segs[j]->chain=chain;
1031 < #ifdef DEBUG
1032 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 < #endif      
1034 <          //save time by checking here if the extended chain is already acceptable for trimming
1035 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 <            l5=0;
1037 <            l3=chain->astart-2;
1038 < #ifdef DEBUG
1039 <          if (debug && a3segs.Count()>0) {
1040 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 <            }
1042 < #endif
1043 <            return true;
1044 <            }
1045 <          } //chain can be extended
1046 <      }
1047 <   } //collect segment alignments into chains
1048 < */  
1049 < return false; //no adapter parts found
946 >  }//for each 5' adapter
947 >  return trimmed;
948   }
949  
950   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
951 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
951 > if (adapters5.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
955 < //try to see if adapter is fully included in the read
956 < int fi=-1;
957 < for (int ai=0;ai<adapters.Count();ai++) {
958 <  if (adapters[ai]->a5.is_empty()) continue;
959 <  int a5len=adapters[ai]->a5.length();
960 <  GStr& adapter5=adapters[ai]->a5;
961 <  if ((fi=seq.index(adapter5))>=0) {
962 <    if (fi<rlen-fi-a5len) {//match is closer to the right end
963 <       l5=fi+a5len;
964 <       l3=rlen-1;
965 <       }
966 <     else {
967 <       l5=0;
968 <       l3=fi-1;
969 <       }
970 <    return true;
971 <    }
972 < #ifdef DEBUG
973 <  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 < #endif
1077 <
1078 <  //try the easy way out first - look for an exact match of 11 bases
1079 <  int fdlen=11;
1080 <   if (a5len<16) {
1081 <    fdlen=a5len>>1;
1082 <    }
1083 <  if (fdlen>4) {
1084 <      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 <      if ((fi=adapter5.index(rstart))>=0) {
1086 < #ifdef DEBUG
1087 <        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 < #endif
1089 <        if (extendMatch(seq.chars(), rlen, 1,
1090 <                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091 <            return true;
1092 <        }
1093 <      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 <      GStr bstr=adapter5.substr(-fdlen);
1095 <      if ((fi=seq.index(bstr))>=0) {
1096 < #ifdef DEBUG
1097 <        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 < #endif
1099 <        if (extendMatch(seq.chars(), rlen, fi,
1100 <                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 <           return true;
1102 <        }
1103 <      } //tried to matching at most 11 bases first
1104 <
1105 <  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106 <  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 <  int wordSize=3;
1108 <  int hlen=12;
1109 <  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 <  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 <  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 <  for (int iw=0;iw<=hlen;iw++) {
1113 <    int apstart=a5len-iw-wordSize;
1114 <    fi=0;
1115 <    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 <    int qv=get3mer_value(adapter5.chars()+apstart);
1117 <    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 <    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 < #ifdef DEBUG
1120 <      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 < #endif
1122 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 <                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 <      fi++;
1125 <      if (fi>imax) break;
1126 <      }
1127 <    } //for each wmer in the last hlen bases of the adaptor
1128 < //if we're here we couldn't find a good extension
1129 <  return false; //no adapter parts found
1130 < }//for each 5' adapter
955 > bool trimmed=false;
956 > GStr wseq(seq.chars());
957 > int wlen=rlen;
958 > for (int ai=0;ai<adapters5.Count();ai++) {
959 >  if (adapters5[ai].is_empty()) continue;
960 >  int alen=adapters5[ai].length();
961 >  GStr& aseq=adapters5[ai];
962 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963 >  if (l_bestaln) {
964 >     trimmed=true;
965 >     l5+=l_bestaln->sr;
966 >     delete l_bestaln;
967 >     if (l5>=rlen) l5=rlen-1;
968 >     if (l3-l5+1<min_read_len) return true;
969 >     wseq=seq.substr(l5,l3-l5+1);
970 >     wlen=wseq.length();
971 >     }
972 >  }//for each 5' adapter
973 >  return trimmed;
974   }
975  
976   //convert qvs to/from phred64 from/to phread33
# Line 1196 | Line 1039
1039   return true;
1040   }
1041  
1042 + #ifdef GDEBUG
1043 + void showTrim(GStr& s, int l5, int l3) {
1044 +  if (l5>0) {
1045 +    color_bg(c_red);
1046 +    }
1047 +  for (int i=0;i<s.length()-1;i++) {
1048 +    if (i && i==l5) color_resetbg();
1049 +    fprintf(stderr, "%c", s[i]);
1050 +    if (i==l3) color_bg(c_red);
1051 +   }
1052 +  fprintf(stderr, "%c", s[s.length()-1]);
1053 +  color_reset();
1054 +  fprintf(stderr, "\n");
1055 + }
1056 + #endif
1057 +
1058   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1059   //returns 0 if the read was untouched, 1 if it was just trimmed
1060   // and a trash code if it was trashed
1061   l5=0;
1062   l3=rseq.length()-1;
1063 + #ifdef GDEBUG
1064 +   //rseq.reverse();
1065 +   GMessage(">%s\n", rname.chars());
1066 +   GMessage("%s\n",rseq.chars());
1067 + #endif
1068   if (l3-l5+1<min_read_len) {
1069     return 's';
1070     }
# Line 1236 | Line 1100
1100     w5=0;
1101     w3=wseq.length()-1;
1102     }
1103 < if (a3len>0) {
1104 <  if (trim_adapter3(wseq, w5, w3)) {
1103 > char trim_code;
1104 > do {
1105 >  trim_code=0;
1106 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1107 >      trim_code='A';
1108 >      }
1109 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1110 >      trim_code='T';
1111 >      }
1112 >  else if (trim_adapter5(wseq, w5, w3)) {
1113 >      trim_code='5';
1114 >      }
1115 >  if (trim_code) {
1116 >     #ifdef GDEBUG
1117 >      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1118 >      showTrim(wseq, w5, w3);
1119 >     #endif
1120       int trimlen=wseq.length()-(w3-w5+1);
1121 <     num_trimmed3++;
1122 <     if (trimlen<min_trimmed3)
1123 <         min_trimmed3=trimlen;
1121 >     num_trimmed5++;
1122 >     if (trimlen<min_trimmed5)
1123 >         min_trimmed5=trimlen;
1124       l5+=w5;
1125       l3-=(wseq.length()-1-w3);
1126       if (w3-w5+1<min_read_len) {
1127 <         return '3';
1127 >         return trim_code;
1128           }
1129        //-- keep only the w5..w3 range
1130        wseq=wseq.substr(w5, w3-w5+1);
1131        if (!wqv.is_empty())
1132           wqv=wqv.substr(w5, w3-w5+1);
1133 <      }//some adapter was trimmed
1134 <   } //adapter trimming
1135 < if (a5len>0) {
1136 <  if (trim_adapter5(wseq, w5, w3)) {
1133 >      }// trimmed at 5' end
1134 > } while (trim_code);
1135 >
1136 > do {
1137 >  trim_code=0;
1138 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1139 >      trim_code='A';
1140 >      }
1141 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1142 >      trim_code='T';
1143 >      }
1144 >  else if (trim_adapter3(wseq, w5, w3)) {
1145 >      trim_code='3';
1146 >      }
1147 >  if (trim_code) {
1148 >     #ifdef GDEBUG
1149 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1150 >     showTrim(wseq, w5, w3);
1151 >     #endif
1152       int trimlen=wseq.length()-(w3-w5+1);
1153 <     num_trimmed5++;
1154 <     if (trimlen<min_trimmed5)
1155 <         min_trimmed5=trimlen;
1153 >     num_trimmed3++;
1154 >     if (trimlen<min_trimmed3)
1155 >         min_trimmed3=trimlen;
1156       l5+=w5;
1157       l3-=(wseq.length()-1-w3);
1158       if (w3-w5+1<min_read_len) {
1159 <         return '5';
1159 >         return trim_code;
1160           }
1161        //-- keep only the w5..w3 range
1162        wseq=wseq.substr(w5, w3-w5+1);
1163        if (!wqv.is_empty())
1164           wqv=wqv.substr(w5, w3-w5+1);
1165 <      }//some adapter was trimmed
1166 <   } //adapter trimming
1165 >      }//trimming at 3' end
1166 > } while (trim_code);
1167 >
1168 >
1169   if (doCollapse) {
1170     //keep read for later
1171     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1328 | Line 1224
1224   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1225   if (freport==NULL || trashcode<=' ') return;
1226   if (trashcode=='3' || trashcode=='5') {
1227 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1227 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1228     }
1229   else {
1230     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1433 | Line 1329
1329          i++;
1330          }
1331        if (l[i]!=0) {
1332 <        adapters.Add(new CAdapters(NULL, &(l[i])));
1332 >        GStr s(&(l[i]));
1333 >      #ifdef GDEBUG
1334 >          //s.reverse();
1335 >      #endif
1336 >        adapters3.Add(s);
1337          continue;
1338          }
1339        }
# Line 1443 | Line 1343
1343        GStr a5,a3;
1344        if (s.nextToken(a5))
1345           s.nextToken(a3);
1346 <      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1347 <                                a3.is_empty()?NULL:a3.chars()));
1346 >      a5.upper();
1347 >      a3.upper();
1348 >     #ifdef GDEBUG
1349 >     //   a5.reverse();
1350 >     //   a3.reverse();
1351 >     #endif
1352 >      adapters5.Add(a5);
1353 >      adapters3.Add(a3);
1354        }
1355     }
1356 <   return adapters.Count();
1356 >   return adapters5.Count()+adapters3.Count();
1357   }
1358  
1359   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines