ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 6 | Line 6
6   #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15   for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 25
25      file(s) named <input>.<outsuffix> which will be created in the \n\
26      current (working) directory; (writes to stdout if -o- is given);\n\
27      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 < -f  file with adapter sequences to trim, each line having this format:\n\
29 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -A  disable polyA trimming (enabled by default)\n\
35 < -T  enable polyT trimming (disabled by default)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
# Line 49 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
56 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 >
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 63 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 93 | Line 94
94   int qv_cvtadd=0; //could be -31 or +31
95  
96   // adaptor matching metrics -- for X-drop ungapped extension
97 + //const int match_reward=2;
98 + //const int mismatch_penalty=3;
99 + const int match_reward=2;
100 + const int mismatch_penalty=4;
101 + const int Xdrop=10;
102 +
103   const int poly_m_score=2; //match score
104   const int poly_mis_score=-3; //mismatch
105   const int poly_dropoff_score=7;
# Line 101 | Line 108
108   const char *polyA_seed="AAAA";
109   const char *polyT_seed="TTTT";
110  
111 < struct CAdapters {
112 <    GStr a5;
113 <    GStr a3;
114 <    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
115 <      }
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adaptor length matching that's
118 >              //enough to consider the alignment
119 >   GAlnTrimType trim_type;
120 >   bool use_reverse;
121 >   CASeqData(bool rev=false):seq(),seqr(),
122 >             amlen(0), use_reverse(rev) {
123 >     trim_type=galn_None; //should be updated later!
124 >     for (int i=0;i<4096;i++) {
125 >        pz[i]=NULL;
126 >        pzr[i]=NULL;
127 >        }
128 >     }
129 >
130 >   void update(const char* s) {
131 >         seq=s;
132 >         table6mers(seq.chars(), seq.length(), pz);
133 >         amlen=iround(double(seq.length())*0.8);
134 >         if (amlen<12)
135 >                amlen=12;
136 >         if (!use_reverse) return;
137 >         //reverse complement
138 >         seqr=s;
139 >         int slen=seq.length();
140 >         for (int i=0;i<slen;i++)
141 >           seqr[i]=ntComplement(seq[slen-i-1]);
142 >         table6mers(seqr.chars(), seqr.length(), pzr);
143 >   }
144 >
145 >   ~CASeqData() {
146 >     for (int i=0;i<4096;i++) {
147 >       delete pz[i];
148 >       delete pzr[i];
149 >     }
150 >   }
151   };
152  
153 < GPVec<CAdapters> adapters;
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155 >
156 > CGreedyAlignData* gxmem_l=NULL;
157 > CGreedyAlignData* gxmem_r=NULL;
158  
159   // element in dhash:
160   class FqDupRec {
# Line 158 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < int loadAdapters(const char* fname);
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209  
210 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 <                       GStr& s, GStr& infname, GStr& infname2);
210 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 >                       GStr& s, GStr& infname, GStr& infname2);
212   // uses outsuffix to generate output file names and open file handles as needed
213 <
213 >
214   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215   void trash_report(char trashcode, GStr& rname, FILE* freport);
216  
217 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
217 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218            GStr& rname, GStr& rinfo, GStr& infname);
219  
220 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
220 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222  
223   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
# Line 178 | Line 225
225   int dust(GStr& seq);
226   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231   void convertPhred(char* q, int len);
232   void convertPhred(GStr& q);
233  
234   int main(int argc, char * const argv[]) {
235 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
235 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236    int e;
237    if ((e=args.isError())>0) {
238        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 196 | Line 243
243    convert_phred=(args.getOpt('Q')!=NULL);
244    doCollapse=(args.getOpt('C')!=NULL);
245    doDust=(args.getOpt('D')!=NULL);
246 +  revCompl=(args.getOpt('R')!=NULL);
247 +  if (args.getOpt('A')) doPolyTrim=false;
248    /*
249    rawFormat=(args.getOpt('R')!=NULL);
250    if (rawFormat) {
# Line 204 | Line 253
253    */
254    prefix=args.getOpt('n');
255    GStr s=args.getOpt('l');
256 <  if (!s.is_empty())
256 >  if (!s.is_empty())
257       min_read_len=s.asInt();
258    s=args.getOpt('m');
259 <  if (!s.is_empty())
259 >  if (!s.is_empty())
260       max_perc_N=s.asDouble();
261    s=args.getOpt('d');
262    if (!s.is_empty()) {
# Line 233 | Line 282
282          qv_phredtype=64;
283          qv_cvtadd=-31;
284          }
285 <       else
285 >       else
286           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287       }
288    s=args.getOpt('f');
289    if (!s.is_empty()) {
290 <   loadAdapters(s.chars());
290 >   loadAdaptors(s.chars());
291     }
292 <  bool fileAdapters=adapters.Count();
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293    s=args.getOpt('5');
294    if (!s.is_empty()) {
295 <    if (fileAdapters)
295 >    if (fileAdaptors)
296        GError("Error: options -5 and -f cannot be used together!\n");
297      s.upper();
298 <    adapters.Add(new CAdapters(s.chars()));
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300    s=args.getOpt('3');
301    if (!s.is_empty()) {
302 <    if (fileAdapters)
302 >    if (fileAdaptors)
303        GError("Error: options -3 and -f cannot be used together!\n");
304 <    s.upper();
305 <    if (adapters.Count()>0)
257 <          adapters[0]->a3=s.chars();
258 <     else adapters.Add(NULL, new CAdapters(s.chars()));
304 >      s.upper();
305 >      addAdaptor(adaptors3, s, galn_TrimRight);
306      }
307    s=args.getOpt('y');
308    if (!s.is_empty()) {
309       int minmatch=s.asInt();
310       poly_minScore=minmatch*poly_m_score;
311       }
312 <  
312 >
313    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314                           else outsuffix="-";
315    trashReport=  (args.getOpt('r')!=NULL);
# Line 279 | Line 326
326    if (trashReport)
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329 +
330 +  if (adaptors5.Count()>0)
331 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 +  if (adaptors3.Count()>0)
334 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335 +
336    while ((infile=args.nextNonOpt())!=NULL) {
337 +    //for each input file
338      int incounter=0; //counter for input reads
339      int outcounter=0; //counter for output reads
340      int trash_s=0; //too short from the get go
341      int trash_Q=0;
342      int trash_N=0;
343      int trash_D=0;
344 +    int trash_poly=0;
345      int trash_A3=0;
346      int trash_A5=0;
347      s=infile;
# Line 310 | Line 366
366         int a5=0, a3=0, b5=0, b3=0;
367         char tcode=0, tcode2=0;
368         tcode=process_read(seqid, rseq, rqv, a5, a3);
369 <       //if (!doCollapse) {
314 <         if (fq2!=NULL) {
369 >       if (fq2!=NULL) {
370              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 343 | Line 398
398                 int nocounter=0;
399                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400                 }
401 <            } //paired read
347 <       // }
401 >            } //pair read
402         if (tcode>1) { //trashed
403 +         #ifdef GDEBUG
404 +         GMessage(" !!!!TRASH code = %c\n",tcode);
405 +         #endif
406            if (tcode=='s') trash_s++;
407 +          else if (tcode=='A' || tcode=='T') trash_poly++;
408              else if (tcode=='Q') trash_Q++;
409                else if (tcode=='N') trash_N++;
410                 else if (tcode=='D') trash_D++;
# Line 359 | Line 417
417              rseq=rseq.substr(a5,a3-a5+1);
418              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419              }
420 +         #ifdef GDEBUG
421 +            GMessage("  After trimming:\n");
422 +            GMessage("%s\n",rseq.chars());
423 +         #endif
424            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425            }
426         } //for each fastq record
# Line 386 | Line 448
448                 }
449              }
450           outcounter++;
451 <         if (qd->count>maxdup_count) {
451 >         if (qd->count>maxdup_count) {
452              maxdup_count=qd->count;
453              maxdup_seq=seq;
454              }
455           if (isfasta) {
456             if (prefix.is_empty()) {
457 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
457 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458                             rseq.chars());
459               }
460             else { //use custom read name
# Line 403 | Line 465
465           else { //fastq format
466            if (convert_phred) convertPhred(qd->qv, qd->len);
467            if (prefix.is_empty()) {
468 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
468 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469                             rseq.chars(), qd->qv);
470              }
471            else { //use custom read name
# Line 439 | Line 501
501           GMessage("         Trashed by N%%:%9d\n", trash_N);
502         if (trash_Q>0)
503           GMessage("Trashed by low quality:%9d\n", trash_Q);
504 +       if (trash_poly>0)
505 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506         if (trash_A5>0)
507 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508         if (trash_A3>0)
509 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510         }
511      if (trashReport) {
512            FWCLOSE(freport);
# Line 450 | Line 514
514      FWCLOSE(f_out);
515      FWCLOSE(f_out2);
516     } //while each input file
517 <
517 > delete gxmem_l;
518 > delete gxmem_r;
519   //getc(stdin);
520   }
521  
# Line 465 | Line 530
530     const char* seq;
531     bool valid;
532     NData() {
533 +    seqlen=0;
534      NCount=0;
535      end5=0;
536      end3=0;
# Line 495 | Line 561
561       perc_N=(n*100.0)/(end5-end3+1);
562       }
563   };
564 <
564 >
565   static NData feat;
566   int perc_lenN=12; // incremental distance from ends, in percentage of
567            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 <          
568 >
569   void N_analyze(int l5, int l3, int p5, int p3) {
570   /* assumes feat was filled properly */
571   int old_dif, t5,t3,v;
572   if (l3<l5+2 || p5>p3 ) {
573     feat.end5=l5+1;
574     feat.end3=l3+1;
575 <   return;
575 >   return;
576     }
577  
578   t5=feat.NPos[p5]-l5;
579   t3=l3-feat.NPos[p3];
580   old_dif=p3-p5;
581   v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 < if (v>20) v=20; /* enforce N-search limit for very long reads */
582 > if (v>20) v=20; /* enforce N-search limit for very long reads */
583   if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
# Line 530 | Line 596
596             feat.end3=l3+1;
597             return;
598             }
599 <    else
599 >    else
600        N_analyze(l5,l3, p5,p3);
601   }
602  
# Line 571 | Line 637
637   feat.init(rseq);
638   l5=feat.end5-1;
639   l3=feat.end3-1;
640 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
640 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641   if (l5==feat.end5-1 && l3==feat.end3-1) {
642      if (feat.perc_N>max_perc_N) {
643             feat.valid=false;
# Line 589 | Line 655
655     return true;
656     }
657   feat.N_calc();
658 <
658 >
659   if (feat.perc_N>max_perc_N) {
660        feat.valid=false;
661        l3=l5+1;
# Line 601 | Line 667
667   //--------------- dust functions ----------------
668   class DNADuster {
669   public:
670 <  int dustword;
671 <  int dustwindow;
672 <  int dustwindow2;
670 >  int dustword;
671 >  int dustwindow;
672 >  int dustwindow2;
673    int dustcutoff;
674    int mv, iv, jv;
675    int counts[32*32*32];
# Line 698 | Line 764
764                      }
765             }
766           }
767 < //return first;
767 > //return first;
768   }
769   };
770  
# Line 716 | Line 782
782   return ncount;
783   }
784  
719 int get3mer_value(const char* s) {
720 return (s[0]<<16)+(s[1]<<8)+s[2];
721 }
722
723 int w3_match(int qv, const char* str, int slen, int start_index=0) {
724 if (start_index>=slen || start_index<0) return -1;
725 for (int i=start_index;i<slen-3;i++) {
726   int rv=get3mer_value(str+i);
727   if (rv==qv) return i;
728   }
729 return -1;
730 }
731
732 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733 if (end_index>=slen) return -1;
734 if (end_index<0) end_index=slen-1;
735 for (int i=end_index-2;i>=0;i--) {
736   int rv=get3mer_value(str+i);
737   if (rv==qv) return i;
738   }
739 return -1;
740 }
741
785   struct SLocScore {
786    int pos;
787    int score;
# Line 757 | Line 800
800   };
801  
802   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 + if (!doPolyTrim) return false;
804   int rlen=seq.length();
805   l5=0;
806   l3=rlen-1;
# Line 765 | Line 809
809   //assumes N trimming was already done
810   //so a poly match should be very close to the end of the read
811   // -- find the initial match (seed)
812 < int lmin=GMAX((rlen-12), 0);
812 > int lmin=GMAX((rlen-16), 0);
813   int li;
814   for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
# Line 779 | Line 823
823   SLocScore loc(ri, poly_m_score<<2);
824   SLocScore maxloc(loc);
825   //extend right
826 < while (ri<rlen-2) {
826 > while (ri<rlen-1) {
827     ri++;
828     if (seq[ri]==polyChar) {
829                  loc.add(ri,poly_m_score);
# Line 796 | Line 840
840        }
841     }
842   ri=maxloc.pos;
843 < if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
843 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844   //ri = right boundary for the poly match
845   //extend left
846   loc.set(li, maxloc.score);
# Line 817 | Line 861
861         maxloc=loc;
862         }
863      }
864 < if (maxloc.score>poly_minScore && ri>=rlen-3) {
865 <    l5=li;
866 <    l3=ri;
864 > li=maxloc.pos;
865 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 >  //trimming this li-ri match at 3' end
869 >    l3=li-1;
870 >    if (l3<0) l3=0;
871      return true;
872      }
873   return false;
874   }
875  
828
876   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 + if (!doPolyTrim) return false;
878   int rlen=seq.length();
879   l5=0;
880   l3=rlen-1;
# Line 835 | Line 883
883   //assumes N trimming was already done
884   //so a poly match should be very close to the end of the read
885   // -- find the initial match (seed)
886 < int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
886 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887   int li;
888   for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
# Line 865 | Line 913
913         }
914      }
915   li=maxloc.pos;
916 < if (li>3) return false; //no trimming wanted, too far from 5' end
916 > if (li>5) return false; //no trimming wanted, too far from 5' end
917   //li = right boundary for the poly match
918  
919   //extend right
920   loc.set(ri, maxloc.score);
921   maxloc.pos=ri;
922 < while (ri<rlen-2) {
922 > while (ri<rlen-1) {
923     ri++;
924     if (seq[ri]==polyChar) {
925                  loc.add(ri,poly_m_score);
# Line 887 | Line 935
935        maxloc=loc;
936        }
937     }
938 <
939 < if (maxloc.score>poly_minScore && li<=3) {
940 <    l5=li;
941 <    l3=ri;
938 > ri=maxloc.pos;
939 > if ((maxloc.score==poly_minScore && li==0) ||
940 >     (maxloc.score>poly_minScore && li<2)
941 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
942 >    //adjust l5 to reflect this trimming of 5' end
943 >    l5=ri+1;
944 >    if (l5>rlen-1) l5=rlen-1;
945      return true;
946      }
947   return false;
948   }
949  
950 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
955 < //first try a full match, we might get lucky
956 < int fi=-1;
957 < if ((fi=seq.index(adapter3))>=0) {
958 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
959 <      l5=fi+a3len;
960 <      l3=rlen-1;
961 <      }
962 <    else {
963 <      l5=0;
964 <      l3=fi-1;
913 <      }
914 <   return true;
915 <   }
916 < #ifdef DEBUG
917 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
918 < #endif
919 <
920 < //also, for fast detection of other adapter-only reads that start past
921 < // the beginning of the adapter sequence, try to see if the first a3len-4
922 < // bases of the read are a substring of the adapter
923 < if (rlen>a3len-3) {
924 <   GStr rstart=seq.substr(1,a3len-4);
925 <   if ((fi=adapter3.index(rstart))>=0) {
926 <     l3=rlen-1;
927 <     l5=a3len-4;
928 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 <     return true;
930 <     }
931 <  }
932 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 <  //check the easy cases - 11 bases exact match at the end
934 < int fdlen=11;
935 <  if (a3len<16) {
936 <   fdlen=a3len>>1;
937 <   }
938 < if (fdlen>4) {
939 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
941 <     if ((fi=adapter3.index(rstart))>=0) {
942 < #ifdef DEBUG
943 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 < #endif
945 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
947 <            return true;
948 <       }
949 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
950 <     GStr bstr=adapter3.substr(0, fdlen);
951 <     if ((fi=seq.rindex(bstr))>=0) {
952 < #ifdef DEBUG
953 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
954 < #endif
955 <       if (extendMatch(seq.chars(), rlen, fi,
956 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
957 <            return true;
958 <       }
959 <     } //tried to match 11 bases first
960 <    
961 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 < //-- only extend if the match is in the 3' (ending) region of the read
963 < int wordSize=3;
964 < int hlen=12;
965 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 < imin=rlen-imin;
969 < for (int iw=0;iw<hlen;iw++) {
970 <   //int32* qv=(int32*)(adapter3.chars()+iw);
971 <   int qv=get3mer_value(adapter3.chars()+iw);
972 <   fi=-1;
973 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976 <
977 < #ifdef DEBUG
978 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 < #endif
980 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 <     fi--;
983 <     if (fi<imin) break;
984 <     }
985 <   } //for each wmer in the first hlen bases of the adaptor
986 < /*
987 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 < int hlen2=a3len-wordSize;
991 < //if (hlen2>a3len-4) hlen2=a3len-4;
992 < if (hlen2>hlen) {
993 < #ifdef DEBUG
994 <     if (debug && a3segs.Count()>0) {
995 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
955 > bool trimmed=false;
956 > GStr wseq(seq);
957 > int wlen=rlen;
958 > GXSeqData seqdata;
959 > int numruns=revCompl ? 2 : 1;
960 > for (int ai=0;ai<adaptors3.Count();ai++) {
961 >   for (int r=0;r<numruns;r++) {
962 >     if (r) {
963 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
964 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
965          }
966 < #endif
967 <     for (int iw=hlen;iw<hlen2;iw++) {
968 <         //int* qv=(int32 *)(adapter3.chars()+iw);
969 <         int qv=get3mer_value(adapter3.chars()+iw);
970 <         fi=-1;
971 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
972 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
973 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
974 <                         a3len, iw, wordSize, l5,l3, a3segs);
975 <           fi--;
976 <           if (fi<imin) break;
977 <           }
978 <         } //for each wmer between hlen2 and hlen bases of the adaptor
966 >     else {
967 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
968 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
969 >        }
970 >
971 >  GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 >  if (bestaln) {
973 >     trimmed=true;
974 >     //keep unmatched region on the left OR right (the longer one)
975 >     if (bestaln->sl > wlen-bestaln->sr) {
976 >         //keep left side
977 >         l3-=(wlen-bestaln->sl+1);
978 >         if (l3<0) l3=0;
979 >         }
980 >     else { //keep right side
981 >         l5+=bestaln->sr;
982 >         if (l5>=rlen) l5=rlen-1;
983 >         }
984 >     delete bestaln;
985 >     if (l3-l5+1<min_read_len) return true;
986 >     wseq=seq.substr(l5,l3-l5+1);
987 >     wlen=wseq.length();
988       }
989 < //lastly, analyze collected a3segs for a possible gapped alignment:
990 < GList<CSegChain> segchains(false,true,false);
991 < #ifdef DEBUG
1014 < if (debug && a3segs.Count()>0) {
1015 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1016 <   }
1017 < #endif
1018 < for (int i=0;i<a3segs.Count();i++) {
1019 <   if (a3segs[i]->chain==NULL) {
1020 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 <       CSegChain* newchain=new CSegChain();
1022 <       newchain->setFreeItem(false);
1023 <       newchain->addSegPair(a3segs[i]);
1024 <       a3segs[i]->chain=newchain;
1025 <       segchains.Add(newchain); //just to free them when done
1026 <       }
1027 <   for (int j=i+1;j<a3segs.Count();j++) {
1028 <      CSegChain* chain=a3segs[i]->chain;
1029 <      if (chain->extendChain(a3segs[j])) {
1030 <          a3segs[j]->chain=chain;
1031 < #ifdef DEBUG
1032 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 < #endif      
1034 <          //save time by checking here if the extended chain is already acceptable for trimming
1035 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 <            l5=0;
1037 <            l3=chain->astart-2;
1038 < #ifdef DEBUG
1039 <          if (debug && a3segs.Count()>0) {
1040 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 <            }
1042 < #endif
1043 <            return true;
1044 <            }
1045 <          } //chain can be extended
1046 <      }
1047 <   } //collect segment alignments into chains
1048 < */  
1049 < return false; //no adapter parts found
989 >   }//forward and reverse adaptors
990 >  }//for each 3' adaptor
991 >  return trimmed;
992   }
993  
994 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
995 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
994 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
995 > if (adaptors5.Count()==0) return false;
996   int rlen=seq.length();
997   l5=0;
998   l3=rlen-1;
999 < //try to see if adapter is fully included in the read
1000 < int fi=-1;
1001 < for (int ai=0;ai<adapters.Count();ai++) {
1002 <  if (adapters[ai]->a5.is_empty()) continue;
1003 <  int a5len=adapters[ai]->a5.length();
1004 <  GStr& adapter5=adapters[ai]->a5;
1005 <  if ((fi=seq.index(adapter5))>=0) {
1006 <    if (fi<rlen-fi-a5len) {//match is closer to the right end
1007 <       l5=fi+a5len;
1008 <       l3=rlen-1;
1067 <       }
1068 <     else {
1069 <       l5=0;
1070 <       l3=fi-1;
1071 <       }
1072 <    return true;
1073 <    }
1074 < #ifdef DEBUG
1075 <  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 < #endif
1077 <
1078 <  //try the easy way out first - look for an exact match of 11 bases
1079 <  int fdlen=11;
1080 <   if (a5len<16) {
1081 <    fdlen=a5len>>1;
1082 <    }
1083 <  if (fdlen>4) {
1084 <      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 <      if ((fi=adapter5.index(rstart))>=0) {
1086 < #ifdef DEBUG
1087 <        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 < #endif
1089 <        if (extendMatch(seq.chars(), rlen, 1,
1090 <                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091 <            return true;
999 > bool trimmed=false;
1000 > GStr wseq(seq);
1001 > int wlen=rlen;
1002 > GXSeqData seqdata;
1003 > int numruns=revCompl ? 2 : 1;
1004 > for (int ai=0;ai<adaptors5.Count();ai++) {
1005 >   for (int r=0;r<numruns;r++) {
1006 >     if (r) {
1007 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1008 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1009          }
1010 <      //another easy case: last 11 characters of the adaptor found as a substring of the read
1011 <      GStr bstr=adapter5.substr(-fdlen);
1012 <      if ((fi=seq.index(bstr))>=0) {
1096 < #ifdef DEBUG
1097 <        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 < #endif
1099 <        if (extendMatch(seq.chars(), rlen, fi,
1100 <                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 <           return true;
1010 >     else {
1011 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1012 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1013          }
1014 <      } //tried to matching at most 11 bases first
1015 <
1016 <  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1017 <  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1018 <  int wordSize=3;
1019 <  int hlen=12;
1020 <  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1021 <  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1022 <  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1023 <  for (int iw=0;iw<=hlen;iw++) {
1024 <    int apstart=a5len-iw-wordSize;
1025 <    fi=0;
1026 <    //int* qv=(int32 *)(adapter5.chars()+apstart);
1027 <    int qv=get3mer_value(adapter5.chars()+apstart);
1028 <    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1029 <    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1030 < #ifdef DEBUG
1031 <      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1032 < #endif
1033 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 <                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 <      fi++;
1125 <      if (fi>imax) break;
1126 <      }
1127 <    } //for each wmer in the last hlen bases of the adaptor
1128 < //if we're here we couldn't find a good extension
1129 <  return false; //no adapter parts found
1130 < }//for each 5' adapter
1014 >         GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1015 >         if (bestaln) {
1016 >           trimmed=true;
1017 >           if (bestaln->sl-1 > wlen-bestaln->sr) {
1018 >                   //keep left side
1019 >                   l3-=(wlen-bestaln->sl+1);
1020 >                   if (l3<0) l3=0;
1021 >                   }
1022 >           else { //keep right side
1023 >                   l5+=bestaln->sr;
1024 >                   if (l5>=rlen) l5=rlen-1;
1025 >                   }
1026 >           delete bestaln;
1027 >           if (l3-l5+1<min_read_len) return true;
1028 >           wseq=seq.substr(l5,l3-l5+1);
1029 >           wlen=wseq.length();
1030 >           }
1031 >         } //forward and reverse?
1032 >  }//for each 5' adaptor
1033 >  return trimmed;
1034   }
1035  
1036 < //convert qvs to/from phred64 from/to phread33
1036 > //convert qvs to/from phred64 from/to phread33
1037   void convertPhred(GStr& q) {
1038   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1039   }
# Line 1139 | Line 1042
1042   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1043   }
1044  
1045 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1045 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1046            GStr& rname, GStr& rinfo, GStr& infname) {
1047   rseq="";
1048   rqv="";
# Line 1155 | Line 1058
1058        } //raw qseq format
1059   else { // FASTQ or FASTA */
1060   isfasta=(l[0]=='>');
1061 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1061 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1062        infname.chars(), l);
1063   GStr s(l);
1064   rname=&(l[1]);
1065   for (int i=0;i<rname.length();i++)
1066 <    if (rname[i]<=' ') {
1067 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1068 <       rname.cut(i);
1069 <       break;
1066 >    if (rname[i]<=' ') {
1067 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1068 >       rname.cut(i);
1069 >       break;
1070         }
1071    //now get the sequence
1072 < if ((l=fq.getLine())==NULL)
1072 > if ((l=fq.getLine())==NULL)
1073        GError("Error: unexpected EOF after header for read %s (%s)\n",
1074                     rname.chars(), infname.chars());
1075   rseq=l; //this must be the DNA line
1076   while ((l=fq.getLine())!=NULL) {
1077        //seq can span multiple lines
1078        if (l[0]=='>' || l[0]=='+') {
1079 <           fq.pushBack();
1079 >           fq.pushBack();
1080             break; //
1081             }
1082        rseq+=l;
1083 <      } //check for multi-line seq
1083 >      } //check for multi-line seq
1084   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1085      if ((l=fq.getLine())==NULL)
1086          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1087      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1088 <    if ((l=fq.getLine())==NULL)
1088 >    if ((l=fq.getLine())==NULL)
1089          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1090      rqv=l;
1091 <    //if (rqv.length()!=rseq.length())
1091 >    //if (rqv.length()!=rseq.length())
1092      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1093      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1094        rqv+=l; //append to qv string
# Line 1196 | Line 1099
1099   return true;
1100   }
1101  
1102 + #ifdef GDEBUG
1103 + void showTrim(GStr& s, int l5, int l3) {
1104 +  if (l5>0 || l3==0) {
1105 +    color_bg(c_red);
1106 +    }
1107 +  for (int i=0;i<s.length()-1;i++) {
1108 +    if (i && i==l5) color_resetbg();
1109 +    fprintf(stderr, "%c", s[i]);
1110 +    if (i && i==l3) color_bg(c_red);
1111 +   }
1112 +  fprintf(stderr, "%c", s[s.length()-1]);
1113 +  color_reset();
1114 +  fprintf(stderr, "\n");
1115 + }
1116 + #endif
1117 +
1118   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1119 < //returns 0 if the read was untouched, 1 if it was just trimmed
1119 > //returns 0 if the read was untouched, 1 if it was just trimmed
1120   // and a trash code if it was trashed
1121   l5=0;
1122   l3=rseq.length()-1;
1123 + #ifdef GDEBUG
1124 +   //rseq.reverse();
1125 +   GMessage(">%s\n", rname.chars());
1126 +   GMessage("%s\n",rseq.chars());
1127 + #endif
1128   if (l3-l5+1<min_read_len) {
1129     return 's';
1130     }
# Line 1236 | Line 1160
1160     w5=0;
1161     w3=wseq.length()-1;
1162     }
1163 < if (a3len>0) {
1164 <  if (trim_adapter3(wseq, w5, w3)) {
1163 > char trim_code;
1164 > //clean the more dirty end first - 3'
1165 > int prev_w5=0;
1166 > int prev_w3=0;
1167 > bool w3upd=true;
1168 > bool w5upd=true;
1169 > do {
1170 >  trim_code=0;
1171 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1172 >      trim_code='A';
1173 >      }
1174 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1175 >      trim_code='T';
1176 >      }
1177 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1178 >      trim_code='A';
1179 >      }
1180 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1181 >      trim_code='T';
1182 >      }
1183 >  else if (trim_adaptor5(wseq, w5, w3)) {
1184 >      trim_code='5';
1185 >      }
1186 >  else if (trim_adaptor3(wseq, w5, w3)) {
1187 >      trim_code='3';
1188 >      }
1189 >  if (trim_code) {
1190 >     w3upd=(w3!=prev_w3);
1191 >         w5upd=(w5!=prev_w5);
1192 >         if (w3upd) prev_w3=w3;
1193 >         if (w5upd) prev_w5=w5;
1194 >   #ifdef GDEBUG
1195 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1196 >     showTrim(wseq, w5, w3);
1197 >   #endif
1198       int trimlen=wseq.length()-(w3-w5+1);
1199       num_trimmed3++;
1200 <     if (trimlen<min_trimmed3)
1200 >     if (trimlen<min_trimmed3)
1201           min_trimmed3=trimlen;
1202       l5+=w5;
1203       l3-=(wseq.length()-1-w3);
1204       if (w3-w5+1<min_read_len) {
1205 <         return '3';
1249 <         }
1250 <      //-- keep only the w5..w3 range
1251 <      wseq=wseq.substr(w5, w3-w5+1);
1252 <      if (!wqv.is_empty())
1253 <         wqv=wqv.substr(w5, w3-w5+1);
1254 <      }//some adapter was trimmed
1255 <   } //adapter trimming
1256 < if (a5len>0) {
1257 <  if (trim_adapter5(wseq, w5, w3)) {
1258 <     int trimlen=wseq.length()-(w3-w5+1);
1259 <     num_trimmed5++;
1260 <     if (trimlen<min_trimmed5)
1261 <         min_trimmed5=trimlen;
1262 <     l5+=w5;
1263 <     l3-=(wseq.length()-1-w3);
1264 <     if (w3-w5+1<min_read_len) {
1265 <         return '5';
1205 >         return trim_code;
1206           }
1207        //-- keep only the w5..w3 range
1208        wseq=wseq.substr(w5, w3-w5+1);
1209        if (!wqv.is_empty())
1210           wqv=wqv.substr(w5, w3-w5+1);
1211 <      }//some adapter was trimmed
1212 <   } //adapter trimming
1211 >      }//trimming at 3' end
1212 > } while (trim_code);
1213 >
1214   if (doCollapse) {
1215     //keep read for later
1216     FqDupRec* dr=dhash.Find(wseq.chars());
1217     if (dr==NULL) { //new entry
1218 <          //if (prefix.is_empty())
1219 <             dhash.Add(wseq.chars(),
1218 >          //if (prefix.is_empty())
1219 >             dhash.Add(wseq.chars(),
1220                    new FqDupRec(&wqv, rname.chars()));
1221            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1222           }
# Line 1309 | Line 1250
1250         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1251         }
1252        else {
1253 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1253 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1254                            rseq.chars());
1255         }
1256       }
# Line 1320 | Line 1261
1261         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1262         }
1263        else
1264 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1264 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1265                            rseq.chars(),rqv.chars() );
1266       }
1267   }
# Line 1328 | Line 1269
1269   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1270   if (freport==NULL || trashcode<=' ') return;
1271   if (trashcode=='3' || trashcode=='5') {
1272 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1272 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1273     }
1274   else {
1275     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1420 | Line 1361
1361      }
1362   }
1363  
1364 + void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1365 + //TODO: prepare CASeqData here, and collect hexamers as well
1366 +  if (seq.is_empty() || seq=="-" ||
1367 +          seq=="N/A" || seq==".") return;
1368 +
1369 + CASeqData adata(revCompl);
1370 + int idx=adaptors.Add(adata);
1371 + if (idx<0) GError("Error: failed to add adaptor!\n");
1372 + adaptors[idx].trim_type=trim_type;
1373 + adaptors[idx].update(seq.chars());
1374 + }
1375 +
1376  
1377 < int loadAdapters(const char* fname) {
1377 > int loadAdaptors(const char* fname) {
1378    GLineReader lr(fname);
1379    char* l;
1380    while ((l=lr.nextLine())!=NULL) {
# Line 1433 | Line 1386
1386          i++;
1387          }
1388        if (l[i]!=0) {
1389 <        adapters.Add(new CAdapters(NULL, &(l[i])));
1389 >        GStr s(&(l[i]));
1390 >      #ifdef GDEBUG
1391 >          //s.reverse();
1392 >      #endif
1393 >        addAdaptor(adaptors3, s, galn_TrimRight);
1394          continue;
1395          }
1396        }
# Line 1442 | Line 1399
1399        s.startTokenize("\t ;,:");
1400        GStr a5,a3;
1401        if (s.nextToken(a5))
1402 <         s.nextToken(a3);
1403 <      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1404 <                                a3.is_empty()?NULL:a3.chars()));
1402 >            s.nextToken(a3);
1403 >        else continue; //no tokens on this line
1404 >      GAlnTrimType ttype5=galn_TrimLeft;
1405 >      a5.upper();
1406 >      a3.upper();
1407 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1408 >         a3.clear();
1409 >         ttype5=galn_TrimEither;
1410 >         }
1411 >     #ifdef GDEBUG
1412 >     //   a5.reverse();
1413 >     //   a3.reverse();
1414 >     #endif
1415 >      addAdaptor(adaptors5, a5, ttype5);
1416 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1417        }
1418     }
1419 <   return adapters.Count();
1419 >   return adaptors5.Count()+adaptors3.Count();
1420   }
1421  
1422 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1423 <                       GStr& s, GStr& infname, GStr& infname2) {
1422 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1423 >                       GStr& s, GStr& infname, GStr& infname2) {
1424   // uses outsuffix to generate output file names and open file handles as needed
1425   infname="";
1426   infname2="";
# Line 1479 | Line 1448
1448   s.startTokenize(",:");
1449   s.nextToken(infname);
1450   bool paired=s.nextToken(infname2);
1451 < if (fileExists(infname.chars())==0)
1451 > if (fileExists(infname.chars())==0)
1452      GError("Error: cannot find file %s!\n",infname.chars());
1453   GStr fname(getFileName(infname.chars()));
1454   GStr picmd;
# Line 1501 | Line 1470
1470   if (!paired) return;
1471   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1472   // ---- paired reads:-------------
1473 < if (fileExists(infname2.chars())==0)
1473 > if (fileExists(infname2.chars())==0)
1474       GError("Error: cannot find file %s!\n",infname2.chars());
1475   picmd="";
1476   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines