ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 6 | Line 6
6   #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15   for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 25
25      file(s) named <input>.<outsuffix> which will be created in the \n\
26      current (working) directory; (writes to stdout if -o- is given);\n\
27      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 < -f  file with adapter sequences to trim, each line having this format:\n\
29 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34   -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
# Line 48 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
56 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 >
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 62 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
# Line 71 | Line 72
72   bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 93 | Line 94
94   int qv_cvtadd=0; //could be -31 or +31
95  
96   // adaptor matching metrics -- for X-drop ungapped extension
97 + //const int match_reward=2;
98 + //const int mismatch_penalty=3;
99   const int match_reward=2;
100 < const int mismatch_penalty=3;
101 < const int Xdrop=8;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102  
103   const int poly_m_score=2; //match score
104   const int poly_mis_score=-3; //mismatch
# Line 104 | Line 107
107  
108   const char *polyA_seed="AAAA";
109   const char *polyT_seed="TTTT";
110 < GVec<GStr> adapters5;
111 < GVec<GStr> adapters3;
110 >
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adaptor length matching that's
118 >              //enough to consider the alignment
119 >   GAlnTrimType trim_type;
120 >   bool use_reverse;
121 >   CASeqData(bool rev=false):seq(),seqr(),
122 >             amlen(0), use_reverse(rev) {
123 >     trim_type=galn_None; //should be updated later!
124 >     for (int i=0;i<4096;i++) {
125 >        pz[i]=NULL;
126 >        pzr[i]=NULL;
127 >        }
128 >     }
129 >
130 >   void update(const char* s) {
131 >         seq=s;
132 >         table6mers(seq.chars(), seq.length(), pz);
133 >         amlen=iround(double(seq.length())*0.8);
134 >         if (amlen<12)
135 >                amlen=12;
136 >         if (!use_reverse) return;
137 >         //reverse complement
138 >         seqr=s;
139 >         int slen=seq.length();
140 >         for (int i=0;i<slen;i++)
141 >           seqr[i]=ntComplement(seq[slen-i-1]);
142 >         table6mers(seqr.chars(), seqr.length(), pzr);
143 >   }
144 >
145 >   ~CASeqData() {
146 >     for (int i=0;i<4096;i++) {
147 >       delete pz[i];
148 >       delete pzr[i];
149 >     }
150 >   }
151 > };
152 >
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155  
156   CGreedyAlignData* gxmem_l=NULL;
157   CGreedyAlignData* gxmem_r=NULL;
# Line 158 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < int loadAdapters(const char* fname);
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209  
210 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 <                       GStr& s, GStr& infname, GStr& infname2);
210 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 >                       GStr& s, GStr& infname, GStr& infname2);
212   // uses outsuffix to generate output file names and open file handles as needed
213 <
213 >
214   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215   void trash_report(char trashcode, GStr& rname, FILE* freport);
216  
217 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
217 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218            GStr& rname, GStr& rinfo, GStr& infname);
219  
220 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
220 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222  
223   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
# Line 178 | Line 225
225   int dust(GStr& seq);
226   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231   void convertPhred(char* q, int len);
232   void convertPhred(GStr& q);
233  
234   int main(int argc, char * const argv[]) {
235 <  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
235 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236    int e;
237    if ((e=args.isError())>0) {
238        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 196 | Line 243
243    convert_phred=(args.getOpt('Q')!=NULL);
244    doCollapse=(args.getOpt('C')!=NULL);
245    doDust=(args.getOpt('D')!=NULL);
246 +  revCompl=(args.getOpt('R')!=NULL);
247    if (args.getOpt('A')) doPolyTrim=false;
248    /*
249    rawFormat=(args.getOpt('R')!=NULL);
# Line 205 | Line 253
253    */
254    prefix=args.getOpt('n');
255    GStr s=args.getOpt('l');
256 <  if (!s.is_empty())
256 >  if (!s.is_empty())
257       min_read_len=s.asInt();
258    s=args.getOpt('m');
259 <  if (!s.is_empty())
259 >  if (!s.is_empty())
260       max_perc_N=s.asDouble();
261    s=args.getOpt('d');
262    if (!s.is_empty()) {
# Line 234 | Line 282
282          qv_phredtype=64;
283          qv_cvtadd=-31;
284          }
285 <       else
285 >       else
286           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287       }
288    s=args.getOpt('f');
289    if (!s.is_empty()) {
290 <   loadAdapters(s.chars());
290 >   loadAdaptors(s.chars());
291     }
292 <  bool fileAdapters=adapters5.Count()+adapters3.Count();
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293    s=args.getOpt('5');
294    if (!s.is_empty()) {
295 <    if (fileAdapters)
295 >    if (fileAdaptors)
296        GError("Error: options -5 and -f cannot be used together!\n");
297      s.upper();
298 <    adapters5.Add(s);
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300    s=args.getOpt('3');
301    if (!s.is_empty()) {
302 <    if (fileAdapters)
302 >    if (fileAdaptors)
303        GError("Error: options -3 and -f cannot be used together!\n");
304        s.upper();
305 <      adapters3.Add(s);
305 >      addAdaptor(adaptors3, s, galn_TrimRight);
306      }
307    s=args.getOpt('y');
308    if (!s.is_empty()) {
309       int minmatch=s.asInt();
310       poly_minScore=minmatch*poly_m_score;
311       }
312 <  
312 >
313    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314                           else outsuffix="-";
315    trashReport=  (args.getOpt('r')!=NULL);
# Line 279 | Line 327
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329  
330 <  if (adapters5.Count()>0)
331 <    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 <  if (adapters3.Count()>0)
330 >  if (adaptors5.Count()>0)
331 >    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 >        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 >  if (adaptors3.Count()>0)
334      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335  
336    while ((infile=args.nextNonOpt())!=NULL) {
# Line 352 | Line 401
401              } //pair read
402         if (tcode>1) { //trashed
403           #ifdef GDEBUG
404 <         GMessage(" !!!!TRASH => 'N'\n");
404 >         GMessage(" !!!!TRASH code = %c\n",tcode);
405           #endif
406            if (tcode=='s') trash_s++;
407            else if (tcode=='A' || tcode=='T') trash_poly++;
# Line 399 | Line 448
448                 }
449              }
450           outcounter++;
451 <         if (qd->count>maxdup_count) {
451 >         if (qd->count>maxdup_count) {
452              maxdup_count=qd->count;
453              maxdup_seq=seq;
454              }
455           if (isfasta) {
456             if (prefix.is_empty()) {
457 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
457 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458                             rseq.chars());
459               }
460             else { //use custom read name
# Line 416 | Line 465
465           else { //fastq format
466            if (convert_phred) convertPhred(qd->qv, qd->len);
467            if (prefix.is_empty()) {
468 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
468 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469                             rseq.chars(), qd->qv);
470              }
471            else { //use custom read name
# Line 455 | Line 504
504         if (trash_poly>0)
505           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506         if (trash_A5>0)
507 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508         if (trash_A3>0)
509 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510         }
511      if (trashReport) {
512            FWCLOSE(freport);
# Line 481 | Line 530
530     const char* seq;
531     bool valid;
532     NData() {
533 +    seqlen=0;
534      NCount=0;
535      end5=0;
536      end3=0;
# Line 511 | Line 561
561       perc_N=(n*100.0)/(end5-end3+1);
562       }
563   };
564 <
564 >
565   static NData feat;
566   int perc_lenN=12; // incremental distance from ends, in percentage of
567            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 <          
568 >
569   void N_analyze(int l5, int l3, int p5, int p3) {
570   /* assumes feat was filled properly */
571   int old_dif, t5,t3,v;
572   if (l3<l5+2 || p5>p3 ) {
573     feat.end5=l5+1;
574     feat.end3=l3+1;
575 <   return;
575 >   return;
576     }
577  
578   t5=feat.NPos[p5]-l5;
579   t3=l3-feat.NPos[p3];
580   old_dif=p3-p5;
581   v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 < if (v>20) v=20; /* enforce N-search limit for very long reads */
582 > if (v>20) v=20; /* enforce N-search limit for very long reads */
583   if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
# Line 546 | Line 596
596             feat.end3=l3+1;
597             return;
598             }
599 <    else
599 >    else
600        N_analyze(l5,l3, p5,p3);
601   }
602  
# Line 587 | Line 637
637   feat.init(rseq);
638   l5=feat.end5-1;
639   l3=feat.end3-1;
640 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
640 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641   if (l5==feat.end5-1 && l3==feat.end3-1) {
642      if (feat.perc_N>max_perc_N) {
643             feat.valid=false;
# Line 605 | Line 655
655     return true;
656     }
657   feat.N_calc();
658 <
658 >
659   if (feat.perc_N>max_perc_N) {
660        feat.valid=false;
661        l3=l5+1;
# Line 617 | Line 667
667   //--------------- dust functions ----------------
668   class DNADuster {
669   public:
670 <  int dustword;
671 <  int dustwindow;
672 <  int dustwindow2;
670 >  int dustword;
671 >  int dustwindow;
672 >  int dustwindow2;
673    int dustcutoff;
674    int mv, iv, jv;
675    int counts[32*32*32];
# Line 714 | Line 764
764                      }
765             }
766           }
767 < //return first;
767 > //return first;
768   }
769   };
770  
# Line 732 | Line 782
782   return ncount;
783   }
784  
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742   int rv=get3mer_value(str+i);
743   if (rv==qv) return i;
744   }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752   int rv=get3mer_value(str+i);
753   if (rv==qv) return i;
754   }
755 return -1;
756 }
757
785   struct SLocScore {
786    int pos;
787    int score;
# Line 782 | Line 809
809   //assumes N trimming was already done
810   //so a poly match should be very close to the end of the read
811   // -- find the initial match (seed)
812 < int lmin=GMAX((rlen-12), 0);
812 > int lmin=GMAX((rlen-16), 0);
813   int li;
814   for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
# Line 796 | Line 823
823   SLocScore loc(ri, poly_m_score<<2);
824   SLocScore maxloc(loc);
825   //extend right
826 < while (ri<rlen-2) {
826 > while (ri<rlen-1) {
827     ri++;
828     if (seq[ri]==polyChar) {
829                  loc.add(ri,poly_m_score);
# Line 835 | Line 862
862         }
863      }
864   li=maxloc.pos;
865 < if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
866 <    (maxloc.score==poly_minScore && ri==rlen-1) ||
867 <    (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
865 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868    //trimming this li-ri match at 3' end
869      l3=li-1;
870      if (l3<0) l3=0;
# Line 856 | Line 883
883   //assumes N trimming was already done
884   //so a poly match should be very close to the end of the read
885   // -- find the initial match (seed)
886 < int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
886 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887   int li;
888   for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
# Line 892 | Line 919
919   //extend right
920   loc.set(ri, maxloc.score);
921   maxloc.pos=ri;
922 < while (ri<rlen-2) {
922 > while (ri<rlen-1) {
923     ri++;
924     if (seq[ri]==polyChar) {
925                  loc.add(ri,poly_m_score);
# Line 911 | Line 938
938   ri=maxloc.pos;
939   if ((maxloc.score==poly_minScore && li==0) ||
940       (maxloc.score>poly_minScore && li<2)
941 <     || (maxloc.score>(poly_minScore<<1) && li<6)) {
941 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
942      //adjust l5 to reflect this trimming of 5' end
943      l5=ri+1;
944      if (l5>rlen-1) l5=rlen-1;
# Line 920 | Line 947
947   return false;
948   }
949  
950 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
951 < if (adapters3.Count()==0) return false;
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
955   bool trimmed=false;
956 < GStr wseq(seq.chars());
956 > GStr wseq(seq);
957   int wlen=rlen;
958 < for (int ai=0;ai<adapters3.Count();ai++) {
959 <  if (adapters3[ai].is_empty()) continue;
960 <  int alen=adapters3[ai].length();
961 <  GStr& aseq=adapters3[ai];
962 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
963 <  if (r_bestaln) {
964 <     trimmed=true;
965 <     //keep unmatched region on the left, if any
966 <     l3-=(wlen-r_bestaln->sl+1);
967 <     delete r_bestaln;
968 <     if (l3<0) l3=0;
969 <     if (l3-l5+1<min_read_len) return true;
970 <     wseq=seq.substr(l5,l3-l5+1);
971 <     wlen=wseq.length();
958 > GXSeqData seqdata;
959 > int numruns=revCompl ? 2 : 1;
960 > GList<GXAlnInfo> bestalns(true, true, false);
961 > for (int ai=0;ai<adaptors3.Count();ai++) {
962 >   for (int r=0;r<numruns;r++) {
963 >     if (r) {
964 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966 >        }
967 >     else {
968 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 >        }
971 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 >         if (aln) {
973 >           if (aln->strong) {
974 >                   trimmed=true;
975 >                   bestalns.Add(aln);
976 >                   break; //will check the rest next time
977 >                   }
978 >            else bestalns.Add(aln);
979 >           }
980 >   }//forward and reverse adaptors
981 >   if (trimmed) break; //will check the rest in the next cycle
982 >  }//for each 3' adaptor
983 > if (bestalns.Count()>0) {
984 >           GXAlnInfo* aln=bestalns[0];
985 >           if (aln->sl-1 > wlen-aln->sr) {
986 >                   //keep left side
987 >                   l3-=(wlen-aln->sl+1);
988 >                   if (l3<0) l3=0;
989 >                   }
990 >           else { //keep right side
991 >                   l5+=aln->sr;
992 >                   if (l5>=rlen) l5=rlen-1;
993 >                   }
994 >           //delete aln;
995 >           //if (l3-l5+1<min_read_len) return true;
996 >           wseq=seq.substr(l5,l3-l5+1);
997 >           wlen=wseq.length();
998 >           return true; //break the loops here to report a good find
999       }
1000 <  }//for each 5' adapter
947 <  return trimmed;
1000 >  return false;
1001   }
1002  
1003 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1004 < if (adapters5.Count()==0) return false;
1003 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1004 > if (adaptors5.Count()==0) return false;
1005   int rlen=seq.length();
1006   l5=0;
1007   l3=rlen-1;
1008   bool trimmed=false;
1009 < GStr wseq(seq.chars());
1009 > GStr wseq(seq);
1010   int wlen=rlen;
1011 < for (int ai=0;ai<adapters5.Count();ai++) {
1012 <  if (adapters5[ai].is_empty()) continue;
1013 <  int alen=adapters5[ai].length();
1014 <  GStr& aseq=adapters5[ai];
1015 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
1016 <  if (l_bestaln) {
1017 <     trimmed=true;
1018 <     l5+=l_bestaln->sr;
1019 <     delete l_bestaln;
1020 <     if (l5>=rlen) l5=rlen-1;
1021 <     if (l3-l5+1<min_read_len) return true;
1022 <     wseq=seq.substr(l5,l3-l5+1);
1023 <     wlen=wseq.length();
1011 > GXSeqData seqdata;
1012 > int numruns=revCompl ? 2 : 1;
1013 > GList<GXAlnInfo> bestalns(true, true, false);
1014 > for (int ai=0;ai<adaptors5.Count();ai++) {
1015 >   for (int r=0;r<numruns;r++) {
1016 >     if (r) {
1017 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1018 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1019 >        }
1020 >     else {
1021 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1022 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1023 >        }
1024 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1025 >         if (aln) {
1026 >           if (aln->strong) {
1027 >                   trimmed=true;
1028 >                   bestalns.Add(aln);
1029 >                   break; //will check the rest next time
1030 >                   }
1031 >            else bestalns.Add(aln);
1032 >           }
1033 >         } //forward and reverse?
1034 >   if (trimmed) break; //will check the rest in the next cycle
1035 >  }//for each 5' adaptor
1036 >  if (bestalns.Count()>0) {
1037 >           GXAlnInfo* aln=bestalns[0];
1038 >           if (aln->sl-1 > wlen-aln->sr) {
1039 >                   //keep left side
1040 >                   l3-=(wlen-aln->sl+1);
1041 >                   if (l3<0) l3=0;
1042 >                   }
1043 >           else { //keep right side
1044 >                   l5+=aln->sr;
1045 >                   if (l5>=rlen) l5=rlen-1;
1046 >                   }
1047 >           //delete aln;
1048 >           //if (l3-l5+1<min_read_len) return true;
1049 >           wseq=seq.substr(l5,l3-l5+1);
1050 >           wlen=wseq.length();
1051 >           return true; //break the loops here to report a good find
1052       }
1053 <  }//for each 5' adapter
973 <  return trimmed;
1053 >  return false;
1054   }
1055  
1056 < //convert qvs to/from phred64 from/to phread33
1056 > //convert qvs to/from phred64 from/to phread33
1057   void convertPhred(GStr& q) {
1058   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1059   }
# Line 982 | Line 1062
1062   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1063   }
1064  
1065 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1065 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1066            GStr& rname, GStr& rinfo, GStr& infname) {
1067   rseq="";
1068   rqv="";
# Line 998 | Line 1078
1078        } //raw qseq format
1079   else { // FASTQ or FASTA */
1080   isfasta=(l[0]=='>');
1081 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1081 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1082        infname.chars(), l);
1083   GStr s(l);
1084   rname=&(l[1]);
1085   for (int i=0;i<rname.length();i++)
1086 <    if (rname[i]<=' ') {
1087 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1088 <       rname.cut(i);
1089 <       break;
1086 >    if (rname[i]<=' ') {
1087 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1088 >       rname.cut(i);
1089 >       break;
1090         }
1091    //now get the sequence
1092 < if ((l=fq.getLine())==NULL)
1092 > if ((l=fq.getLine())==NULL)
1093        GError("Error: unexpected EOF after header for read %s (%s)\n",
1094                     rname.chars(), infname.chars());
1095   rseq=l; //this must be the DNA line
1096   while ((l=fq.getLine())!=NULL) {
1097        //seq can span multiple lines
1098        if (l[0]=='>' || l[0]=='+') {
1099 <           fq.pushBack();
1099 >           fq.pushBack();
1100             break; //
1101             }
1102        rseq+=l;
1103 <      } //check for multi-line seq
1103 >      } //check for multi-line seq
1104   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1105      if ((l=fq.getLine())==NULL)
1106          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1107      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1108 <    if ((l=fq.getLine())==NULL)
1108 >    if ((l=fq.getLine())==NULL)
1109          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1110      rqv=l;
1111 <    //if (rqv.length()!=rseq.length())
1111 >    //if (rqv.length()!=rseq.length())
1112      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1113      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1114        rqv+=l; //append to qv string
# Line 1041 | Line 1121
1121  
1122   #ifdef GDEBUG
1123   void showTrim(GStr& s, int l5, int l3) {
1124 <  if (l5>0) {
1124 >  if (l5>0 || l3==0) {
1125      color_bg(c_red);
1126      }
1127    for (int i=0;i<s.length()-1;i++) {
1128      if (i && i==l5) color_resetbg();
1129      fprintf(stderr, "%c", s[i]);
1130 <    if (i==l3) color_bg(c_red);
1130 >    if (i && i==l3) color_bg(c_red);
1131     }
1132    fprintf(stderr, "%c", s[s.length()-1]);
1133    color_reset();
# Line 1056 | Line 1136
1136   #endif
1137  
1138   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1139 < //returns 0 if the read was untouched, 1 if it was just trimmed
1139 > //returns 0 if the read was untouched, 1 if it was just trimmed
1140   // and a trash code if it was trashed
1141   l5=0;
1142   l3=rseq.length()-1;
1143   #ifdef GDEBUG
1144 +   //rseq.reverse();
1145     GMessage(">%s\n", rname.chars());
1146     GMessage("%s\n",rseq.chars());
1147   #endif
# Line 1100 | Line 1181
1181     w3=wseq.length()-1;
1182     }
1183   char trim_code;
1184 + //clean the more dirty end first - 3'
1185 + int prev_w5=0;
1186 + int prev_w3=0;
1187 + bool w3upd=true;
1188 + bool w5upd=true;
1189   do {
1190    trim_code=0;
1191 <  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1191 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1192        trim_code='A';
1193        }
1194 <  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1194 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1195        trim_code='T';
1196        }
1197 <  else if (trim_adapter5(wseq, w5, w3)) {
1112 <      trim_code='5';
1113 <      }
1114 <  if (trim_code) {
1115 <     #ifdef GDEBUG
1116 <      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1117 <      showTrim(wseq, w5, w3);
1118 <     #endif
1119 <     int trimlen=wseq.length()-(w3-w5+1);
1120 <     num_trimmed5++;
1121 <     if (trimlen<min_trimmed5)
1122 <         min_trimmed5=trimlen;
1123 <     l5+=w5;
1124 <     l3-=(wseq.length()-1-w3);
1125 <     if (w3-w5+1<min_read_len) {
1126 <         return trim_code;
1127 <         }
1128 <      //-- keep only the w5..w3 range
1129 <      wseq=wseq.substr(w5, w3-w5+1);
1130 <      if (!wqv.is_empty())
1131 <         wqv=wqv.substr(w5, w3-w5+1);
1132 <      }// trimmed at 5' end
1133 < } while (trim_code);
1134 <
1135 < do {
1136 <  trim_code=0;
1137 <  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1197 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1198        trim_code='A';
1199        }
1200 <  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1200 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1201        trim_code='T';
1202        }
1203 <  else if (trim_adapter3(wseq, w5, w3)) {
1203 >  else if (trim_adaptor5(wseq, w5, w3)) {
1204 >      trim_code='5';
1205 >      }
1206 >  else if (trim_adaptor3(wseq, w5, w3)) {
1207        trim_code='3';
1208        }
1209    if (trim_code) {
1210 <     #ifdef GDEBUG
1210 >     w3upd=(w3!=prev_w3);
1211 >         w5upd=(w5!=prev_w5);
1212 >         if (w3upd) prev_w3=w3;
1213 >         if (w5upd) prev_w5=w5;
1214 >   #ifdef GDEBUG
1215       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1216       showTrim(wseq, w5, w3);
1217 <     #endif
1217 >   #endif
1218       int trimlen=wseq.length()-(w3-w5+1);
1219       num_trimmed3++;
1220 <     if (trimlen<min_trimmed3)
1220 >     if (trimlen<min_trimmed3)
1221           min_trimmed3=trimlen;
1222       l5+=w5;
1223       l3-=(wseq.length()-1-w3);
# Line 1164 | Line 1231
1231        }//trimming at 3' end
1232   } while (trim_code);
1233  
1167
1234   if (doCollapse) {
1235     //keep read for later
1236     FqDupRec* dr=dhash.Find(wseq.chars());
1237     if (dr==NULL) { //new entry
1238 <          //if (prefix.is_empty())
1239 <             dhash.Add(wseq.chars(),
1238 >          //if (prefix.is_empty())
1239 >             dhash.Add(wseq.chars(),
1240                    new FqDupRec(&wqv, rname.chars()));
1241            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1242           }
# Line 1204 | Line 1270
1270         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1271         }
1272        else {
1273 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1273 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1274                            rseq.chars());
1275         }
1276       }
# Line 1215 | Line 1281
1281         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1282         }
1283        else
1284 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1284 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1285                            rseq.chars(),rqv.chars() );
1286       }
1287   }
# Line 1315 | Line 1381
1381      }
1382   }
1383  
1384 + void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1385 + //TODO: prepare CASeqData here, and collect hexamers as well
1386 +  if (seq.is_empty() || seq=="-" ||
1387 +          seq=="N/A" || seq==".") return;
1388 +
1389 + CASeqData adata(revCompl);
1390 + int idx=adaptors.Add(adata);
1391 + if (idx<0) GError("Error: failed to add adaptor!\n");
1392 + adaptors[idx].trim_type=trim_type;
1393 + adaptors[idx].update(seq.chars());
1394 + }
1395 +
1396  
1397 < int loadAdapters(const char* fname) {
1397 > int loadAdaptors(const char* fname) {
1398    GLineReader lr(fname);
1399    char* l;
1400    while ((l=lr.nextLine())!=NULL) {
# Line 1329 | Line 1407
1407          }
1408        if (l[i]!=0) {
1409          GStr s(&(l[i]));
1410 <        adapters3.Add(s);
1410 >      #ifdef GDEBUG
1411 >          //s.reverse();
1412 >      #endif
1413 >        addAdaptor(adaptors3, s, galn_TrimRight);
1414          continue;
1415          }
1416        }
# Line 1338 | Line 1419
1419        s.startTokenize("\t ;,:");
1420        GStr a5,a3;
1421        if (s.nextToken(a5))
1422 <         s.nextToken(a3);
1422 >            s.nextToken(a3);
1423 >        else continue; //no tokens on this line
1424 >      GAlnTrimType ttype5=galn_TrimLeft;
1425        a5.upper();
1426        a3.upper();
1427 <      adapters5.Add(a5);
1428 <      adapters3.Add(a3);
1427 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1428 >         a3.clear();
1429 >         ttype5=galn_TrimEither;
1430 >         }
1431 >     #ifdef GDEBUG
1432 >     //   a5.reverse();
1433 >     //   a3.reverse();
1434 >     #endif
1435 >      addAdaptor(adaptors5, a5, ttype5);
1436 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1437        }
1438     }
1439 <   return adapters5.Count()+adapters3.Count();
1439 >   return adaptors5.Count()+adaptors3.Count();
1440   }
1441  
1442 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1443 <                       GStr& s, GStr& infname, GStr& infname2) {
1442 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1443 >                       GStr& s, GStr& infname, GStr& infname2) {
1444   // uses outsuffix to generate output file names and open file handles as needed
1445   infname="";
1446   infname2="";
# Line 1377 | Line 1468
1468   s.startTokenize(",:");
1469   s.nextToken(infname);
1470   bool paired=s.nextToken(infname2);
1471 < if (fileExists(infname.chars())==0)
1471 > if (fileExists(infname.chars())==0)
1472      GError("Error: cannot find file %s!\n",infname.chars());
1473   GStr fname(getFileName(infname.chars()));
1474   GStr picmd;
# Line 1399 | Line 1490
1490   if (!paired) return;
1491   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1492   // ---- paired reads:-------------
1493 < if (fileExists(infname2.chars())==0)
1493 > if (fileExists(infname2.chars())==0)
1494       GError("Error: cannot find file %s!\n",infname2.chars());
1495   picmd="";
1496   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines