ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/seqalign.cpp
Revision: 89
Committed: Mon Oct 3 18:42:24 2011 UTC (9 years, 7 months ago) by gpertea
File size: 8035 byte(s)
Log Message:
separate wrappers for 5' and  3' alignments

Line User Rev File contents
1 gpertea 74 #include "GBase.h"
2     #include "GArgs.h"
3     #include "GStr.h"
4     #include "GList.hh"
5     #include "GXAlign.h"
6    
7     #define USAGE "Usage:\n\
8 gpertea 85 seqalign {-i <2line_file> | -a <seq1> -b <seq2>} \\ \n\
9     [X=<xdrop>] [M=<match_reward>] [N=<mismatch_penalty]\
10     "
11 gpertea 74
12     bool debug=true;
13 gpertea 86 int Xdrop=8;
14 gpertea 85 int match_reward=2;
15 gpertea 86 int mismatch_penalty=3;
16 gpertea 74
17     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
18     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
19    
20 gpertea 75 char* seq_cleanup(const char* s, int& s_len) {
21 gpertea 74 char* r=NULL;
22     int slen=strlen(s);
23     GMALLOC(r, slen+1);
24     int newlen=0;
25     for (int i=0;i<slen;i++) {
26     if (s[i]<'A' || s[i]>'z' || (s[i]>'Z' && s[i]<'a')) continue;
27     r[newlen]=s[i];
28     newlen++;
29     }
30     r[newlen]=0;
31 gpertea 75 s_len=newlen;
32 gpertea 74 return r;
33     }
34    
35 gpertea 89 bool loadSeqs(GLineReader& lr, char*& s1, char*& s2,
36 gpertea 75 int& s1_len, int& s2_len) {
37 gpertea 74 char* l=NULL;
38     s1=NULL;
39     s2=NULL;
40     while ((l=lr.nextLine())!=NULL) {
41     if (lr.length()<=3 || l[0]=='#') continue;
42     if (s1==NULL) {
43 gpertea 75 s1=seq_cleanup(l, s1_len);
44 gpertea 74 }
45 gpertea 89 else if (s2==NULL) {
46     s2=seq_cleanup(l, s2_len);
47     return true;
48     }
49 gpertea 74 else break;
50     }
51 gpertea 89 if (s1!=NULL) GFREE(s1);
52     return false;
53 gpertea 74 }
54    
55    
56 gpertea 89 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
57     bool editscript=false;
58     #ifdef GDEBUG
59     editscript=true;
60     GMessage("==========> matching Right (3') end ========\n");
61     #endif
62     GList<GXSeed> rseeds(true,true,false);
63     GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
64     GList<GXAlnInfo> galns(true,true,false);
65     int max_top_bands=5;
66     int top_band_count=0;
67     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
68     for (int b=0;b<alnbands->Count();b++) {
69     if (alnbands->Get(b)->score<4) break;
70     top_band_count++;
71     GXBand& band=*(alnbands->Get(b));
72     band.seeds.setSorted(cmpSeedScore);
73     anchor_seeds.Add(band.seeds.First());
74     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
75     }
76     #ifdef GDEBUG
77     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
78     #endif
79 gpertea 84
80 gpertea 89 for (int i=0;i<anchor_seeds.Count();i++) {
81     GXSeed& aseed=*anchor_seeds[i];
82     int a1=aseed.a_ofs+(aseed.len>>1)+1;
83     int a2=aseed.b_ofs+(aseed.len>>1)+1;
84     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
85     seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
86     match_reward, mismatch_penalty, Xdrop);
87     if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
88     galns.AddIfNew(alninfo, true);
89     else delete alninfo;
90     }
91 gpertea 85
92 gpertea 89 #ifdef GDEBUG
93     for (int i=0;i<galns.Count();i++) {
94     GXAlnInfo* alninfo=galns[i];
95     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
96     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
97     if (alninfo->gapinfo!=NULL) {
98     GMessage("Alignment:\n");
99     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
100     }
101 gpertea 80 }
102 gpertea 89 #endif
103     //---- done
104     delete alnbands;
105     if (galns.Count()) {
106     GXAlnInfo* bestaln=galns.Shift();
107     return bestaln;
108     }
109     else return NULL;
110     }
111 gpertea 86
112 gpertea 89 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
113     bool editscript=false;
114     #ifdef GDEBUG
115     editscript=true;
116     GMessage("==========> matching Left (5') end ========\n");
117     #endif
118     GList<GXSeed> rseeds(true,true,false);
119     GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
120     GList<GXAlnInfo> galns(true,true,false);
121     int max_top_bands=5;
122     int top_band_count=0;
123     GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
124     for (int b=0;b<alnbands->Count();b++) {
125     if (alnbands->Get(b)->score<4) break;
126     top_band_count++;
127     GXBand& band=*(alnbands->Get(b));
128     band.seeds.setSorted(cmpSeedScore);
129     anchor_seeds.Add(band.seeds.First());
130     if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
131     }
132     #ifdef GDEBUG
133     GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
134     #endif
135    
136     for (int i=0;i<anchor_seeds.Count();i++) {
137     GXSeed& aseed=*anchor_seeds[i];
138     int a1=aseed.a_ofs+(aseed.len>>1)+1;
139     int a2=aseed.b_ofs+(aseed.len>>1)+1;
140     GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
141     seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
142     match_reward, mismatch_penalty, Xdrop-2);
143     if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
144     //TODO: at 5' end we should use a higher pid threshold
145     // we could also decrease the X-drop value!
146     galns.AddIfNew(alninfo, true);
147     else delete alninfo;
148     }
149    
150     #ifdef GDEBUG
151     for (int i=0;i<galns.Count();i++) {
152     GXAlnInfo* alninfo=galns[i];
153     GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
154     alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
155     if (alninfo->gapinfo!=NULL) {
156     GMessage("Alignment:\n");
157     alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
158     }
159     }
160     #endif
161     //---- done
162     delete alnbands;
163     if (galns.Count()) {
164     GXAlnInfo* bestaln=galns.Shift();
165     return bestaln;
166     }
167     else return NULL;
168 gpertea 80 }
169    
170    
171    
172 gpertea 89 void findTrimming(char* aseq, int alen, char* rseq, int rlen, SGreedyAlignMem* gxmem=NULL) {
173    
174     GXAlnInfo* r_bestaln=match_RightEnd(aseq, alen, rseq, rlen, gxmem);
175    
176     #ifdef GDEBUG
177     if (r_bestaln)
178     GMessage("******** Best 3' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
179     r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
180     else GMessage("******** No suitable 3' alignment found.\n");
181     #endif
182     if (r_bestaln)
183     fprintf(stdout, "3prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
184     r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
185    
186     delete r_bestaln;
187    
188     //testing: reverse both sequences and redo the search at 5' end
189     reverseChars(aseq);
190     reverseChars(rseq);
191    
192     GXAlnInfo* l_bestaln=match_LeftEnd(aseq, alen, rseq, rlen, gxmem);
193    
194     #ifdef GDEBUG
195     if (l_bestaln)
196     GMessage("******** Best 5' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
197     l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
198     else GMessage("******** No suitable 5' alignment found.\n");
199     #endif
200     if (l_bestaln)
201     fprintf(stdout, "5prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
202     l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
203    
204     delete l_bestaln;
205    
206     GFREE(aseq);
207     GFREE(rseq);
208     }
209    
210 gpertea 74 //================= MAIN =====================
211    
212     int main(int argc, char * const argv[]) {
213 gpertea 85 GArgs args(argc, argv, "X=M=N=a:b:i:");
214 gpertea 74 int e;
215     if ((e=args.isError())>0)
216     GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
217     if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
218     char* seqa=NULL;
219     char* seqb=NULL;
220 gpertea 89 GStr s=args.getOpt('X');
221     if (!s.is_empty())
222     Xdrop=s.asInt();
223     s=args.getOpt('M');
224     if (!s.is_empty())
225     match_reward=s.asInt();
226     s=args.getOpt('N');
227     if (!s.is_empty())
228     mismatch_penalty=s.asInt();
229    
230     s=args.getOpt('i');
231 gpertea 75 int seqa_len=0, seqb_len=0;
232 gpertea 89 if (!s.is_empty()) {
233     GLineReader lr(s.chars());
234     SGreedyAlignMem* gxmem=new SGreedyAlignMem(match_reward, mismatch_penalty, Xdrop);
235     while (loadSeqs(lr, seqa, seqb, seqa_len, seqb_len)) {
236     findTrimming(seqa, seqa_len, seqb, seqb_len, gxmem);
237     }
238     delete gxmem;
239     }
240 gpertea 74 else {
241     s=args.getOpt('a');
242 gpertea 75 if (!s.is_empty()) seqa=seq_cleanup(s.chars(), seqa_len);
243 gpertea 74 s=args.getOpt('b');
244 gpertea 75 if (!s.is_empty()) seqb=seq_cleanup(s.chars(), seqb_len);
245 gpertea 89 if (seqa==NULL || seqa_len<2 || seqb==NULL || seqb_len<2)
246     GError("Error: two sequences with length>1 should be given!\n");
247     findTrimming(seqa, seqa_len, seqb, seqb_len);
248 gpertea 74 }
249 gpertea 85
250 gpertea 74 }