ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/seqalign.cpp
Revision: 83
Committed: Thu Sep 29 21:32:42 2011 UTC (9 years, 8 months ago) by gpertea
File size: 4380 byte(s)
Log Message:
wip

Line File contents
1 #include "GBase.h"
2 #include "GArgs.h"
3 #include "GStr.h"
4 #include "GList.hh"
5 #include "GXAlign.h"
6
7 #define USAGE "Usage:\n\
8 seqalign -i <2line_file> | -a <seq1> -b <seq2>\n"
9
10 bool debug=true;
11
12 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
13 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
14
15 char* seq_cleanup(const char* s, int& s_len) {
16 char* r=NULL;
17 int slen=strlen(s);
18 GMALLOC(r, slen+1);
19 int newlen=0;
20 for (int i=0;i<slen;i++) {
21 if (s[i]<'A' || s[i]>'z' || (s[i]>'Z' && s[i]<'a')) continue;
22 r[newlen]=s[i];
23 newlen++;
24 }
25 r[newlen]=0;
26 s_len=newlen;
27 return r;
28 }
29
30 void loadSeqs(const char* fname, char*& s1, char*& s2,
31 int& s1_len, int& s2_len) {
32 GLineReader lr(fname);
33 char* l=NULL;
34 s1=NULL;
35 s2=NULL;
36 while ((l=lr.nextLine())!=NULL) {
37 if (lr.length()<=3 || l[0]=='#') continue;
38 if (s1==NULL) {
39 s1=seq_cleanup(l, s1_len);
40 }
41 else if (s2==NULL) s2=seq_cleanup(l, s2_len);
42 else break;
43 }
44 }
45
46
47 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len) {
48 //offsets must be in the middle of a good (best) HSP (ungapped match)
49 //GXAlnInfo* alninfo=GreedyAlign(seqa, 22, seqb, 16, editscript);
50 bool editscript=false;
51 #ifdef GDEBUG
52 editscript=true;
53 GMessage("==========> matching Right (3') end ========\n");
54 #endif
55 GList<GXSeed> rseeds(true,true,false);
56 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
57 GList<GXAlnInfo> galns(true,true,false);
58 int max_top_bands=5;
59 int top_band_count=0;
60 //GList<GXSeed> anchor_seeds(true,false,true);
61 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
62 for (int b=0;b<alnbands->Count();b++) {
63 if (alnbands->Get(b)->score<4) break;
64 top_band_count++;
65 GXBand& band=*(alnbands->Get(b));
66 band.seeds.setSorted(cmpSeedScore);
67 anchor_seeds.Add(band.seeds.First());
68 //if (band.seeds.Count()>1) anchor_seeds.Add(band.seeds[1]);
69 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
70 }
71 #ifdef GDEBUG
72 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
73 #endif
74 for (int i=0;i<anchor_seeds.Count();i++) {
75 GXSeed& aseed=*anchor_seeds[i];
76 //TODO: refine this by checking the right-extension score first (GreedyExtend_Right?)
77 // because there is no need to check left if the right side doesn't align to the end
78 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, aseed.a_ofs+(aseed.len>>1)+1, seqa_len,
79 seqb, aseed.b_ofs+(aseed.len>>1)+1, seqb_len, editscript);
80 if (alninfo) galns.AddIfNew(alninfo, true);
81 }
82 for (int i=0;i<galns.Count();i++) {
83 GXAlnInfo* alninfo=galns[i];
84 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
85 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
86 #ifdef GDEBUG
87 if (alninfo->gapinfo!=NULL) {
88 GMessage("Alignment:\n");
89 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
90 }
91 #endif
92 //delete alninfo;
93 }
94 //---- done here
95 delete alnbands;
96 if (galns.Count()) {
97 GXAlnInfo* bestaln=galns.Shift();
98 return bestaln;
99 }
100 else return NULL;
101 }
102
103
104
105 //================= MAIN =====================
106
107 int main(int argc, char * const argv[]) {
108 GArgs args(argc, argv, "a:b:i:");
109 int e;
110 if ((e=args.isError())>0)
111 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
112 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
113 char* seqa=NULL;
114 char* seqb=NULL;
115 GStr s=args.getOpt('i');
116 int seqa_len=0, seqb_len=0;
117 if (!s.is_empty()) loadSeqs(s.chars(), seqa, seqb, seqa_len, seqb_len);
118 else {
119 s=args.getOpt('a');
120 if (!s.is_empty()) seqa=seq_cleanup(s.chars(), seqa_len);
121 s=args.getOpt('b');
122 if (!s.is_empty()) seqb=seq_cleanup(s.chars(), seqb_len);
123 }
124 if (seqa==NULL || seqa_len<2 || seqb==NULL || seqb_len<2)
125 GError("Error: two sequences with length>1 should be given!\n");
126 #ifdef GDEBUG
127 GMessage("seqa: %s\n",seqa);
128 GMessage("seqb: %s\n",seqb);
129 #endif
130 GXAlnInfo* bestaln=match_RightEnd(seqa, seqa_len, seqb, seqb_len);
131 //#ifdef GDEBUG
132 if (bestaln)
133 GMessage("******** Best Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
134 bestaln->ql, bestaln->qr, bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
135 else GMessage("******** No suitable terminal alignment found.\n");
136 //#endif
137 delete bestaln;
138 GFREE(seqa);
139 GFREE(seqb);
140 }