ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
# Line 48 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
55 >
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
# Line 62 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
# Line 71 | Line 72
72   bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 104 | Line 105
105  
106   const char *polyA_seed="AAAA";
107   const char *polyT_seed="TTTT";
108 < GVec<GStr> adapters5;
109 < GVec<GStr> adapters3;
108 >
109 > struct CASeqData {
110 >   //positional data for every possible hexamer in an adapter
111 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 >   GStr seq; //actual adapter sequence data
114 >   GStr seqr; //reverse complement sequence
115 >   bool use_reverse;
116 >   CASeqData(bool rev=false):seq(),seqr() {
117 >     use_reverse=rev;
118 >     for (int i=0;i<4096;i++) {
119 >        pz[i]=NULL;
120 >        pzr[i]=NULL;
121 >        }
122 >     }
123 >
124 >   void update(const char* s) {
125 >   seq=s;
126 >   table6mers(seq.chars(), seq.length(), pz);
127 >   if (!use_reverse) return;
128 >   //reverse complement
129 >   seqr=s;
130 >   int slen=seq.length();
131 >   for (int i=0;i<slen;i++)
132 >     seqr[i]=ntComplement(seq[slen-i-1]);
133 >   table6mers(seqr.chars(), seqr.length(), pzr);
134 >   }
135 >
136 >   ~CASeqData() {
137 >     for (int i=0;i<4096;i++) {
138 >       delete pz[i];
139 >       delete pzr[i];
140 >     }
141 >   }
142 > };
143 >
144 > GVec<CASeqData> adapters5;
145 > GVec<CASeqData> adapters3;
146  
147   CGreedyAlignData* gxmem_l=NULL;
148   CGreedyAlignData* gxmem_r=NULL;
# Line 158 | Line 195
195  
196   GHash<FqDupRec> dhash; //hash to keep track of duplicates
197  
198 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199   int loadAdapters(const char* fname);
200  
201 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202 <                       GStr& s, GStr& infname, GStr& infname2);
201 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202 >                       GStr& s, GStr& infname, GStr& infname2);
203   // uses outsuffix to generate output file names and open file handles as needed
204 <
204 >
205   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
206   void trash_report(char trashcode, GStr& rname, FILE* freport);
207  
208 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
208 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
209            GStr& rname, GStr& rinfo, GStr& infname);
210  
211 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
211 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
212   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
213  
214   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
# Line 185 | Line 223
223   void convertPhred(GStr& q);
224  
225   int main(int argc, char * const argv[]) {
226 <  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
226 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227    int e;
228    if ((e=args.isError())>0) {
229        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 196 | Line 234
234    convert_phred=(args.getOpt('Q')!=NULL);
235    doCollapse=(args.getOpt('C')!=NULL);
236    doDust=(args.getOpt('D')!=NULL);
237 +  revCompl=(args.getOpt('R')!=NULL);
238    if (args.getOpt('A')) doPolyTrim=false;
239    /*
240    rawFormat=(args.getOpt('R')!=NULL);
# Line 205 | Line 244
244    */
245    prefix=args.getOpt('n');
246    GStr s=args.getOpt('l');
247 <  if (!s.is_empty())
247 >  if (!s.is_empty())
248       min_read_len=s.asInt();
249    s=args.getOpt('m');
250 <  if (!s.is_empty())
250 >  if (!s.is_empty())
251       max_perc_N=s.asDouble();
252    s=args.getOpt('d');
253    if (!s.is_empty()) {
# Line 234 | Line 273
273          qv_phredtype=64;
274          qv_cvtadd=-31;
275          }
276 <       else
276 >       else
277           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
278       }
279    s=args.getOpt('f');
# Line 247 | Line 286
286      if (fileAdapters)
287        GError("Error: options -5 and -f cannot be used together!\n");
288      s.upper();
289 <    adapters5.Add(s);
289 >    addAdapter(adapters5, s);
290      }
291    s=args.getOpt('3');
292    if (!s.is_empty()) {
293      if (fileAdapters)
294        GError("Error: options -3 and -f cannot be used together!\n");
295        s.upper();
296 <      adapters3.Add(s);
296 >      addAdapter(adapters3, s);
297      }
298    s=args.getOpt('y');
299    if (!s.is_empty()) {
300       int minmatch=s.asInt();
301       poly_minScore=minmatch*poly_m_score;
302       }
303 <  
303 >
304    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
305                           else outsuffix="-";
306    trashReport=  (args.getOpt('r')!=NULL);
# Line 370 | Line 409
409              }
410           #ifdef GDEBUG
411              GMessage("  After trimming:\n");
412 <            GMessage("<==%s\n",rseq.chars());
412 >            GMessage("%s\n",rseq.chars());
413           #endif
414            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
415            }
# Line 399 | Line 438
438                 }
439              }
440           outcounter++;
441 <         if (qd->count>maxdup_count) {
441 >         if (qd->count>maxdup_count) {
442              maxdup_count=qd->count;
443              maxdup_seq=seq;
444              }
445           if (isfasta) {
446             if (prefix.is_empty()) {
447 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
447 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
448                             rseq.chars());
449               }
450             else { //use custom read name
# Line 416 | Line 455
455           else { //fastq format
456            if (convert_phred) convertPhred(qd->qv, qd->len);
457            if (prefix.is_empty()) {
458 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
458 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
459                             rseq.chars(), qd->qv);
460              }
461            else { //use custom read name
# Line 481 | Line 520
520     const char* seq;
521     bool valid;
522     NData() {
523 +    seqlen=0;
524      NCount=0;
525      end5=0;
526      end3=0;
# Line 511 | Line 551
551       perc_N=(n*100.0)/(end5-end3+1);
552       }
553   };
554 <
554 >
555   static NData feat;
556   int perc_lenN=12; // incremental distance from ends, in percentage of
557            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
558 <          
558 >
559   void N_analyze(int l5, int l3, int p5, int p3) {
560   /* assumes feat was filled properly */
561   int old_dif, t5,t3,v;
562   if (l3<l5+2 || p5>p3 ) {
563     feat.end5=l5+1;
564     feat.end3=l3+1;
565 <   return;
565 >   return;
566     }
567  
568   t5=feat.NPos[p5]-l5;
569   t3=l3-feat.NPos[p3];
570   old_dif=p3-p5;
571   v=(int)((((double)(l3-l5))*perc_lenN)/100);
572 < if (v>20) v=20; /* enforce N-search limit for very long reads */
572 > if (v>20) v=20; /* enforce N-search limit for very long reads */
573   if (t5 < v ) {
574     l5=feat.NPos[p5]+1;
575     p5++;
# Line 546 | Line 586
586             feat.end3=l3+1;
587             return;
588             }
589 <    else
589 >    else
590        N_analyze(l5,l3, p5,p3);
591   }
592  
# Line 587 | Line 627
627   feat.init(rseq);
628   l5=feat.end5-1;
629   l3=feat.end3-1;
630 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
630 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631   if (l5==feat.end5-1 && l3==feat.end3-1) {
632      if (feat.perc_N>max_perc_N) {
633             feat.valid=false;
# Line 605 | Line 645
645     return true;
646     }
647   feat.N_calc();
648 <
648 >
649   if (feat.perc_N>max_perc_N) {
650        feat.valid=false;
651        l3=l5+1;
# Line 617 | Line 657
657   //--------------- dust functions ----------------
658   class DNADuster {
659   public:
660 <  int dustword;
661 <  int dustwindow;
662 <  int dustwindow2;
660 >  int dustword;
661 >  int dustwindow;
662 >  int dustwindow2;
663    int dustcutoff;
664    int mv, iv, jv;
665    int counts[32*32*32];
# Line 714 | Line 754
754                      }
755             }
756           }
757 < //return first;
757 > //return first;
758   }
759   };
760  
# Line 732 | Line 772
772   return ncount;
773   }
774  
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742   int rv=get3mer_value(str+i);
743   if (rv==qv) return i;
744   }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752   int rv=get3mer_value(str+i);
753   if (rv==qv) return i;
754   }
755 return -1;
756 }
757
775   struct SLocScore {
776    int pos;
777    int score;
# Line 782 | Line 799
799   //assumes N trimming was already done
800   //so a poly match should be very close to the end of the read
801   // -- find the initial match (seed)
802 < int lmin=GMAX((rlen-12), 0);
802 > int lmin=GMAX((rlen-16), 0);
803   int li;
804   for (li=rlen-4;li>lmin;li--) {
805     if (seedVal==*(int*)&(seq[li])) {
# Line 796 | Line 813
813   SLocScore loc(ri, poly_m_score<<2);
814   SLocScore maxloc(loc);
815   //extend right
816 < while (ri<rlen-2) {
816 > while (ri<rlen-1) {
817     ri++;
818     if (seq[ri]==polyChar) {
819                  loc.add(ri,poly_m_score);
# Line 834 | Line 851
851         maxloc=loc;
852         }
853      }
854 < if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
855 <    (maxloc.score==poly_minScore && ri==rlen-1) ||
856 <    (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
857 <    l5=li;
858 <    l3=ri;
854 > li=maxloc.pos;
855 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
856 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
857 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
858 >  //trimming this li-ri match at 3' end
859 >    l3=li-1;
860 >    if (l3<0) l3=0;
861      return true;
862      }
863   return false;
# Line 854 | Line 873
873   //assumes N trimming was already done
874   //so a poly match should be very close to the end of the read
875   // -- find the initial match (seed)
876 < int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
876 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
877   int li;
878   for (li=0;li<=lmax;li++) {
879     if (seedVal==*(int*)&(seq[li])) {
# Line 890 | Line 909
909   //extend right
910   loc.set(ri, maxloc.score);
911   maxloc.pos=ri;
912 < while (ri<rlen-2) {
912 > while (ri<rlen-1) {
913     ri++;
914     if (seq[ri]==polyChar) {
915                  loc.add(ri,poly_m_score);
# Line 906 | Line 925
925        maxloc=loc;
926        }
927     }
928 <
928 > ri=maxloc.pos;
929   if ((maxloc.score==poly_minScore && li==0) ||
930       (maxloc.score>poly_minScore && li<2)
931 <     || (maxloc.score>(poly_minScore<<1) && li<6)) {
932 <    l5=li;
933 <    l3=ri;
931 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
932 >    //adjust l5 to reflect this trimming of 5' end
933 >    l5=ri+1;
934 >    if (l5>rlen-1) l5=rlen-1;
935      return true;
936      }
937   return false;
# Line 926 | Line 946
946   GStr wseq(seq.chars());
947   int wlen=rlen;
948   for (int ai=0;ai<adapters3.Count();ai++) {
949 <  if (adapters3[ai].is_empty()) continue;
950 <  int alen=adapters3[ai].length();
951 <  GStr& aseq=adapters3[ai];
952 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
949 >  //if (adapters3[ai].is_empty()) continue;
950 >  int alen=adapters3[ai].seq.length();
951 >  GStr& aseq=adapters3[ai].seq;
952 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 >                            wseq.chars(), wlen, gxmem_r, 74);
954    if (r_bestaln) {
955       trimmed=true;
956       //keep unmatched region on the left, if any
# Line 953 | Line 974
974   GStr wseq(seq.chars());
975   int wlen=rlen;
976   for (int ai=0;ai<adapters5.Count();ai++) {
977 <  if (adapters5[ai].is_empty()) continue;
978 <  int alen=adapters5[ai].length();
979 <  GStr& aseq=adapters5[ai];
980 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
977 >  //if (adapters5[ai].is_empty()) continue;
978 >  int alen=adapters5[ai].seq.length();
979 >  GStr& aseq=adapters5[ai].seq;
980 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 >                 wseq.chars(), wlen, gxmem_l, 84);
982    if (l_bestaln) {
983       trimmed=true;
984       l5+=l_bestaln->sr;
# Line 970 | Line 992
992    return trimmed;
993   }
994  
995 < //convert qvs to/from phred64 from/to phread33
995 > //convert qvs to/from phred64 from/to phread33
996   void convertPhred(GStr& q) {
997   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
998   }
# Line 979 | Line 1001
1001   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1002   }
1003  
1004 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1004 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1005            GStr& rname, GStr& rinfo, GStr& infname) {
1006   rseq="";
1007   rqv="";
# Line 995 | Line 1017
1017        } //raw qseq format
1018   else { // FASTQ or FASTA */
1019   isfasta=(l[0]=='>');
1020 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1020 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1021        infname.chars(), l);
1022   GStr s(l);
1023   rname=&(l[1]);
1024   for (int i=0;i<rname.length();i++)
1025 <    if (rname[i]<=' ') {
1026 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 <       rname.cut(i);
1028 <       break;
1025 >    if (rname[i]<=' ') {
1026 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 >       rname.cut(i);
1028 >       break;
1029         }
1030    //now get the sequence
1031 < if ((l=fq.getLine())==NULL)
1031 > if ((l=fq.getLine())==NULL)
1032        GError("Error: unexpected EOF after header for read %s (%s)\n",
1033                     rname.chars(), infname.chars());
1034   rseq=l; //this must be the DNA line
1035   while ((l=fq.getLine())!=NULL) {
1036        //seq can span multiple lines
1037        if (l[0]=='>' || l[0]=='+') {
1038 <           fq.pushBack();
1038 >           fq.pushBack();
1039             break; //
1040             }
1041        rseq+=l;
1042 <      } //check for multi-line seq
1042 >      } //check for multi-line seq
1043   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1044      if ((l=fq.getLine())==NULL)
1045          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1046      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1047 <    if ((l=fq.getLine())==NULL)
1047 >    if ((l=fq.getLine())==NULL)
1048          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1049      rqv=l;
1050 <    //if (rqv.length()!=rseq.length())
1050 >    //if (rqv.length()!=rseq.length())
1051      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1052      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1053        rqv+=l; //append to qv string
# Line 1036 | Line 1058
1058   return true;
1059   }
1060  
1061 + #ifdef GDEBUG
1062 + void showTrim(GStr& s, int l5, int l3) {
1063 +  if (l5>0) {
1064 +    color_bg(c_red);
1065 +    }
1066 +  for (int i=0;i<s.length()-1;i++) {
1067 +    if (i && i==l5) color_resetbg();
1068 +    fprintf(stderr, "%c", s[i]);
1069 +    if (i==l3) color_bg(c_red);
1070 +   }
1071 +  fprintf(stderr, "%c", s[s.length()-1]);
1072 +  color_reset();
1073 +  fprintf(stderr, "\n");
1074 + }
1075 + #endif
1076 +
1077   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1078 < //returns 0 if the read was untouched, 1 if it was just trimmed
1078 > //returns 0 if the read was untouched, 1 if it was just trimmed
1079   // and a trash code if it was trashed
1080   l5=0;
1081   l3=rseq.length()-1;
1082   #ifdef GDEBUG
1083 +   //rseq.reverse();
1084     GMessage(">%s\n", rname.chars());
1085 <   GMessage("==>%s\n",rseq.chars());
1085 >   GMessage("%s\n",rseq.chars());
1086   #endif
1087   if (l3-l5+1<min_read_len) {
1088     return 's';
# Line 1085 | Line 1124
1124    trim_code=0;
1125    if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1126        trim_code='A';
1088      #ifdef GDEBUG
1089       GMessage("\t-trimmed poly-A at 5' end\n");
1090      #endif
1127        }
1128    else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1129        trim_code='T';
1094      #ifdef GDEBUG
1095       GMessage("\t-trimmed poly-T at 5' end\n");
1096      #endif
1130        }
1131    else if (trim_adapter5(wseq, w5, w3)) {
1132        trim_code='5';
1100      #ifdef GDEBUG
1101       GMessage("\t-trimmed adapter at 5' end\n");
1102      #endif
1133        }
1134    if (trim_code) {
1135 +     #ifdef GDEBUG
1136 +      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 +      showTrim(wseq, w5, w3);
1138 +     #endif
1139       int trimlen=wseq.length()-(w3-w5+1);
1140       num_trimmed5++;
1141       if (trimlen<min_trimmed5)
# Line 1130 | Line 1164
1164        trim_code='3';
1165        }
1166    if (trim_code) {
1167 +     #ifdef GDEBUG
1168 +     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1169 +     showTrim(wseq, w5, w3);
1170 +     #endif
1171       int trimlen=wseq.length()-(w3-w5+1);
1172       num_trimmed3++;
1173 <     if (trimlen<min_trimmed3)
1173 >     if (trimlen<min_trimmed3)
1174           min_trimmed3=trimlen;
1175       l5+=w5;
1176       l3-=(wseq.length()-1-w3);
# Line 1151 | Line 1189
1189     //keep read for later
1190     FqDupRec* dr=dhash.Find(wseq.chars());
1191     if (dr==NULL) { //new entry
1192 <          //if (prefix.is_empty())
1193 <             dhash.Add(wseq.chars(),
1192 >          //if (prefix.is_empty())
1193 >             dhash.Add(wseq.chars(),
1194                    new FqDupRec(&wqv, rname.chars()));
1195            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1196           }
# Line 1186 | Line 1224
1224         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1225         }
1226        else {
1227 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1227 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1228                            rseq.chars());
1229         }
1230       }
# Line 1197 | Line 1235
1235         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1236         }
1237        else
1238 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1238 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1239                            rseq.chars(),rqv.chars() );
1240       }
1241   }
# Line 1297 | Line 1335
1335      }
1336   }
1337  
1338 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339 + //TODO: prepare CASeqData here, and collect hexamers as well
1340 + CASeqData adata(revCompl);
1341 + int idx=adapters.Add(adata);
1342 + if (idx<0) GError("Error: failed to add adaptor!\n");
1343 + adapters[idx].update(seq.chars());
1344 + }
1345 +
1346  
1347   int loadAdapters(const char* fname) {
1348    GLineReader lr(fname);
# Line 1311 | Line 1357
1357          }
1358        if (l[i]!=0) {
1359          GStr s(&(l[i]));
1360 <        adapters3.Add(s);
1360 >      #ifdef GDEBUG
1361 >          //s.reverse();
1362 >      #endif
1363 >        addAdapter(adapters3, s);
1364          continue;
1365          }
1366        }
# Line 1323 | Line 1372
1372           s.nextToken(a3);
1373        a5.upper();
1374        a3.upper();
1375 <      adapters5.Add(a5);
1376 <      adapters3.Add(a3);
1375 >     #ifdef GDEBUG
1376 >     //   a5.reverse();
1377 >     //   a3.reverse();
1378 >     #endif
1379 >      addAdapter(adapters5, a5);
1380 >      addAdapter(adapters3, a3);
1381        }
1382     }
1383     return adapters5.Count()+adapters3.Count();
1384   }
1385  
1386 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387 <                       GStr& s, GStr& infname, GStr& infname2) {
1386 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387 >                       GStr& s, GStr& infname, GStr& infname2) {
1388   // uses outsuffix to generate output file names and open file handles as needed
1389   infname="";
1390   infname2="";
# Line 1359 | Line 1412
1412   s.startTokenize(",:");
1413   s.nextToken(infname);
1414   bool paired=s.nextToken(infname2);
1415 < if (fileExists(infname.chars())==0)
1415 > if (fileExists(infname.chars())==0)
1416      GError("Error: cannot find file %s!\n",infname.chars());
1417   GStr fname(getFileName(infname.chars()));
1418   GStr picmd;
# Line 1381 | Line 1434
1434   if (!paired) return;
1435   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1436   // ---- paired reads:-------------
1437 < if (fileExists(infname2.chars())==0)
1437 > if (fileExists(infname2.chars())==0)
1438       GError("Error: cannot find file %s!\n",infname2.chars());
1439   picmd="";
1440   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines