ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/trimpoly.c
Revision: 14
Committed: Mon Jul 18 20:36:48 2011 UTC (8 years, 10 months ago) by gpertea
File size: 26850 byte(s)
Log Message:
added old trimpoly code just for archive

Line User Rev File contents
1 gpertea 14 /* takes EST sequences from stdin only
2     */
3    
4     #include <stdlib.h>
5     #include <stdio.h>
6     #include <string.h>
7     #include "getopt.h"
8     #include "trimpoly.h"
9     #include "color_c.h"
10     #include "ctype.h"
11    
12     #define USAGE "Usage: \n\
13     trimpoly [-h] [-n <percN>] [-z <min_read_len>] [-y <Nperc>] [-m <percdist>] \\ \n\
14     [-PN] [-e <end>] [-r <minrun>] [-s <minscore>] [-u <maxgap>] \\ \n\
15     [-l <percdist>|-L <range>] [-X][-D] \n\
16     \n\
17     Trims polyA/polyT and Ns from the end of the FASTA DNA sequences read\n\
18     from standard input.\n\
19     Outputs the following columns, tab-delimited:\n\
20     seq_name, perc_N, end5, end3, initial_length \n\
21     \n\
22     -n try the N-trimming if percent of N is above <percN>\n\
23     this option MUST be present if you want to enable N-trimming!\n\
24     Unless -A or -N switches are present, N trimming is done before \n\
25     polyA/T trimming\n\
26     -y try to extract from a sequence the maximum range having \n\
27     percent of N lower than <Nperc> (default: 3.00)\n\
28     -m <percdist> - incremental distance from ends, in percentage of\n\
29     sequence length, where N-trimming is done (default:12) \n\
30     The resulting distance is auto-limited to 30nt. max\n\
31     -N if -n flag was specified, do only N-trimming (skip the \n\
32     polyA/T trimming)\n\
33     -P do the polyA/T trimming first and then the N trimming;\n\
34     only makes sense if -n option is present.\n\
35     Poly A/T trimming:\n\
36     -e <end> restrict the poly trimming to one end only or to the \n\
37     maximum scoring poly match\n\
38     (default is: trim both ends)\n\
39     if <end> is 3 - trim only polyA from end3\n\
40     if <end> is 5 - trim only polyT from end5\n\
41     if <end> is x - only the maximum scoring of end3/end5 will be trimmed\n\
42     (if -X option is used, polyT/polyA will be swapped)\n\
43     -s <minscore> minimum score for poly A/T trimming (default 7)\n\
44     it is dynamically increased by 1 for every 10nt distance from the ends\n\
45     (scoring: +1 for each A/T, -2 for each intervening nucleotide) \n\
46     -r <minrun> minimum count of consecutive A/T (default 5)\n\
47     to start extending and computing score from\n\
48     -u <extstop> the count of non-matching nucleotides\n\
49     that will determine a stop in extending a match (default 4)\n\
50     -l incremental distance from ends, in percentage of length,\n\
51     where polyA/T trimming is allowed (default:15)\n\
52     -L fixed distance (in nucleotides) from both ends where\n\
53     polyA/T trimming is allowed (overrides -l)\n\
54     -o <file> outputs a multi FASTA file containing only the trimmed\n\
55     sequences \n\
56     -R relative shifting for end3, end5 will be displayed \n\
57     in the tabbed output instead of absolute coordinates\n\
58     -H parse starting end5, end3 coordinates from the FASTA file \n\
59     (expect to find them after sequence name, tab delimited)\n\
60     -X inverted trimming: trim polyA from end5 and polyT from end3\n\
61     -Y try inverted trimming if no poly(A/T) was trimmed in the first pass\n\
62     -D debug display mode: sequences are shown trimmed, colored\n\
63     using console codes (trimmed parts have a red background)\n"
64    
65     #define MIN_PERC_LEN 30
66     /*
67     trimpoly -n2 -m5
68     seems to be a good way to trim ESTs with possible 'dirty' ends
69     */
70    
71     typedef struct feature_t {
72     char* seq_name;
73     char* defline;
74     char* seq; /* sequence, upper case, spaces, tabs, newlines are removed */
75     int seqlen;
76     double perc_N;
77     int end5;
78     int end3;
79     int* NPos; /* position of Ns in the sequence - dynamically alocated array */
80     int NCount;
81     } feature_t;
82    
83     struct feature_t feat; /* current sequence data being analysed */
84    
85     /* global options/flags: */
86     int min_read_len=32;
87     int skip_poly=0; /* -N */
88     int poly_first=0; /* -P */
89     int perc_lenN=12; /* -m */
90     double perc_N=0; /* -n */
91     double nsave_limit=3.00; /* -y */
92     int trim_end=0; /* -e */
93     int min_score=7; /* -s */
94     int min_run=5; /* -r */
95     int max_gap=4; /* -u */
96     int perc_len=15; /* -l */
97     int swap_poly=0; /* -X, -Y */
98     int c_show=0; /* -D */
99     char outfile[256]; /* -o */
100     int do_write=0; /* -o */
101     int relative_shift=0; /* -R */
102     int init_ends=0; /* -H */
103     int fixed_len=0;
104     char* seed_5=SEED_T;
105     char* seed_3=SEED_A;
106     char chr_5='T';
107     char chr_3='A';
108     int first_pass=1;
109    
110     void optErr(char opt);
111     void showUsage();
112     char *seq_copy(char *dest, char *src, int *seqsize);
113     char* lstrstr(char* lstart, char *rend, char* substr);
114     char* rstrstr(char* rstart, char *lend, char* substr);
115    
116     /* these funcs are acting directly on feat structure */
117     int analyse3();
118     int analyse5();
119     void N_trim();
120     void N_analyse(int l5, int l3, int p5, int p3);
121     void N_save(int l5, int l3);
122     double N_calc(int e5, int e3, int* percN);
123    
124     int main(int argc, char * const argv[]) {
125     int c,wpos, init_end5=0, init_end3=0, score1=0, score2=0, save5, save3;
126     int alloc_size;
127     int seq_size=0, readlen;
128     char *p, *p_end, *bufpos;
129     char seq_name[256];
130     char def_line[8192];
131     char line[121];
132     FILE * fout=NULL;
133     char inbuf[BUF_LEN]; /* incoming buffer for sequence lines. */
134     char * seq; /* working buffer */
135     alloc_size=BUF_LEN;
136     seq=malloc(BUF_LEN); /* initial allocation for sequence buffer */
137     while ((c=getopt(argc, argv, "hNPXYDRHz:m:n:e:r:s:l:L:o:u:y:")) > 0) {
138     switch (c)
139     { case 'N': skip_poly=1;break;
140     case 'P': poly_first=1;break;
141     case 'm': perc_lenN=strtol(optarg, (char **)NULL, 10);
142     if (perc_lenN==0) optErr('m');
143     break;
144     case 'z': min_read_len=strtol(optarg, (char **)NULL, 10);
145     if (min_read_len<5) optErr('z');
146     break;
147     case 'n': perc_N=strtod(optarg, (char **)NULL);
148     if (perc_N==0) optErr('n');
149     break;
150     case 'y': nsave_limit=strtod(optarg, (char **)NULL);
151     if (nsave_limit==0) optErr('y');
152     break;
153     case 'e': if (optarg[0]=='x') {
154     trim_end=1;
155     break;
156     }
157     trim_end=strtol(optarg, (char **)NULL, 10);
158     if ((trim_end!=3 && trim_end!=5) || trim_end==0)
159     optErr('e');
160     break;
161     case 'r': min_run=strtol(optarg, (char **)NULL, 10);
162     if (min_run<=1) optErr('r');
163     break;
164     case 'u': max_gap=strtol(optarg, (char **)NULL, 10);
165     if (max_gap<=1) optErr('u');
166     break;
167     case 's': min_score=strtol(optarg, (char **)NULL, 10);
168     if (min_score==0) optErr('s');
169     break;
170     case 'l': perc_len=strtol(optarg, (char **)NULL, 10);
171     if (perc_len==0) optErr('l');
172     break;
173     case 'L': fixed_len=strtol(optarg, (char **)NULL, 10);
174     if (fixed_len==0) optErr('L');
175     break;
176     case 'o': strcpy(outfile, optarg);
177     if (strlen(outfile)==0)
178     optErr('o');
179     else do_write=1;
180     break;
181     case 'X': swap_poly=1;break;
182     case 'Y': swap_poly=2;break;
183     case 'H': init_ends=1;break;
184     case 'D': c_show=1;break;
185     case 'h': showUsage();exit(1);
186     case 'R': relative_shift=1;
187     }
188     } /* while options */
189    
190     if (swap_poly==1) {
191     seed_5=SEED_A;
192     seed_3=SEED_T;
193     chr_5='A';
194     chr_3='T';
195     }
196     bufpos=seq;
197     if (do_write) {
198     if ((fout=fopen(outfile,"w"))==NULL) optErr('o');
199     }
200     for (;;) {
201     p_end=fgets(inbuf, BUF_LEN-1,stdin);
202     /* basic sanity check: */
203     /*if (!p_end) { //wtf was I thinking? what if it's eof?
204     p=strchr(inbuf, '\n');
205     if (!p || *(p+1)!='\0') {
206     fprintf(stderr, "fgets error! Line too long (%s)?!\n",inbuf);
207     exit(1);
208     }
209     }*/
210     /* is this a sequence name? */
211     p=strchr(inbuf, '>');
212     if (p || !p_end) { /* starts new sequence, accumulated old one */
213     if (bufpos!=seq) { /* close and analyse existing previous sequence */
214     /* printf("Final: seqsize=%d\n",seq_size); */
215     feat.seq=seq;
216     feat.seqlen=seq_size;
217     feat.seq_name=seq_name;
218     if (init_ends) {/* parsed initial coordinates */
219     feat.end5=init_end5;
220     feat.end3=init_end3;
221     }
222     else {
223     feat.end5=1;
224     feat.end3=feat.seqlen;
225     }
226     if (skip_poly)
227     N_trim(); /* only test for N_trim */
228     else {
229     if (poly_first==0) N_trim();
230     save5=feat.end5;
231     save3=feat.end3;
232     if (trim_end!=5) score1=analyse3();
233     if (trim_end!=3) score2=analyse5();
234     if (swap_poly==2 && save5==feat.end5 &&
235     save3==feat.end3) {
236     /* printf("here: Score1=%d, Score2=%d\n", score1, score2); */
237     seed_5=SEED_A;
238     seed_3=SEED_T;
239     chr_5='A';
240     chr_3='T';
241     if (trim_end!=5) score1=analyse3();
242     if (trim_end!=3) score2=analyse5();
243     seed_5=SEED_T;
244     seed_3=SEED_A;
245     chr_5='T';
246     chr_3='A';
247     }
248     /* printf("Score1=%d, Score2=%d\n", score1, score2); */
249     if (trim_end==1) {
250     /* printf("score5=%d, score3=%d\n", score2,score1); */
251     if (score1<score2) feat.end3=save3;
252     else feat.end5=save5;
253     }
254     if (poly_first!=0) N_trim();
255     }
256     if (relative_shift) {
257     if (init_ends)
258     printf("%s\t\%4.2f\t%d\t%d\t%d\n",
259     seq_name, feat.perc_N, feat.end5-init_end5,init_end3-feat.end3,
260     feat.seqlen);
261     else
262     printf("%s\t\%4.2f\t%d\t%d\t%d\n",
263     seq_name, feat.perc_N, feat.end5-1,feat.seqlen-feat.end3,
264     feat.seqlen);
265     }
266     else
267     printf("%s\t\%4.2f\t%d\t%d\t%d\n",
268     seq_name, feat.perc_N, feat.end5,feat.end3,
269     feat.seqlen);
270     if (c_show)
271     c_print_trim(seq, feat.end5, feat.end3);
272    
273     if (do_write && feat.perc_N<3.00 && feat.end3-feat.end5>=min_read_len-1) {
274     /* write only those having length>=100 and perc_N<3*/
275     if (init_ends) fprintf(fout,">%s\n",seq_name);
276     else fprintf(fout,"%s",def_line);
277     wpos=feat.end5-1;
278     while (feat.end3-wpos>=60) {
279     strncpy(line,seq+wpos,60);line[60]='\0';
280     fprintf(fout,"%s\n",line);
281     wpos+=60;
282     }
283     if (feat.end3-wpos>0) {
284     strncpy(line,seq+wpos,feat.end3-wpos);
285     line[feat.end3-wpos]='\0';
286     fprintf(fout,"%s\n",line);
287     }
288     }
289     }
290     if (!p_end) break; /* exit on end-of-file */
291     /* parse sequence name and, eventually, init coordinates */
292     strcpy(def_line, p); /* keep defline there for -o output */
293     for (p_end=p+1;(*p_end)!='\0' && !isspace(*p_end); p_end++);
294     strncpy(seq_name, p+1,p_end-p-1);
295     seq_name[p_end-p-1]='\0';
296     if (init_ends) { /* tab delimited fields in def_line*/
297     /* after p_end (\t) there are init values */
298     p_end=strchr(p,'\t');
299     if (p_end==NULL) optErr('H');
300     p_end++;
301     p=strchr(p_end,'\t');
302     if (p==NULL) optErr('H');
303     *p='\0';
304     init_end5=strtol(p_end, (char **)NULL, 10);
305     if (init_end5==0) optErr('H');
306     p++;
307     p_end=strchr(p,'\n');
308     if (!p_end) p_end=strchr(p,'\t');
309     if (p_end==NULL) optErr('H');
310     *p_end='\0';
311     init_end3=strtol(p, (char **)NULL, 10);
312     if (init_end3==0 || init_end3<init_end5) optErr('H');
313     }
314     seq_size=0;
315     bufpos=seq; /* no need to reallocate the buffer for each sequence..
316     let it stay at maximum all along */
317     }
318     else { /* extending current sequence : */
319     readlen=strlen(inbuf);
320     if (seq_size+readlen>alloc_size) { /* must realloc, it's gonna blow */
321     alloc_size+=BUF_LEN;
322     seq=realloc(seq, alloc_size);
323     /* printf("Realloc! seqsize=%d, readlen=%d\n",seq_size, readlen); */
324     bufpos=seq+seq_size;
325     }
326     bufpos=seq_copy(bufpos,inbuf, &seq_size);
327     /* append, removing newline chars and incrementing seq_size accordingly */
328     }
329     }/* for input */
330     free(seq);
331     if (do_write) fclose(fout);
332     return 0;
333     }
334    
335     char* seq_copy(char *dest, char *src, int *seq_size) {
336     /* assume destination already has room for all the chars in src!
337     skip any blanks or newlines on copying
338     RETURN: the position immediately after the last char copied!
339     */
340     int i,j;
341     j=0;
342     for (i=0;(src[i]!='\0');i++) {
343     if (src[i]==' ' || src[i]=='\n'|| src[i]=='\t') continue;
344     /* dest[j]=toupper(src[i]);j++; */
345     dest[j]=src[i];j++;
346     }
347     dest[j]='\0';
348     (*seq_size) += j;
349     return &dest[j];
350     }
351    
352    
353    
354     /*********************************************************************/
355     /* Helper functions */
356     /*********************************************************************/
357    
358    
359    
360     void optErr(char opt) {
361     showUsage();
362     fprintf(stderr, "\nIncorrect value specified for -%c option!\n", opt);
363     exit(1);
364     }
365    
366     void showUsage() {
367     fprintf(stderr, USAGE);
368     }
369    
370    
371     char* lstrstr(char* lstart, char *rend, char* substr) { /*like strstr, but starts searching
372     from lstart, going up to rend and returns a pointer to the first (left)
373     matching character in str */
374     char *p;
375     int l,i;
376     l=strlen(substr);
377     p=lstart;
378     while (p+l<rend) {
379     for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break;
380     if (i==l) return p;
381     p++;
382     }
383     return NULL;
384     }
385    
386    
387     char* rstrstr(char* rstart, char *lend, char* substr) { /*like strstr, but starts searching
388     from right end, going up to lend and returns a pointer to the last (right)
389     matching character in str */
390     char *p;
391     int l,i;
392     l=strlen(substr);
393     p=rstart-l+1;
394     while (p>=lend) {
395     for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break;
396     if (i==l) return p+l-1;
397     p--;
398     }
399     return NULL;
400     }
401    
402     /*=================================================*/
403    
404     int analyse5() { /*== end5 trim (polyT if no -X) ======*/
405     int gaplen, locmaxpos,
406     locmaxscore, locminscore, run, locmaxrun, v;
407     char *p;
408     int score;
409     int first_pass=0;
410     int l5=0; /* offset from current end5 */
411     int norange=1;
412     int maxhit_score=0;
413     int maxhit_pos=0;
414     int startpos=0;
415     int pos5=l5; /* distance to end5 */
416     char *seqstart=feat.seq+feat.end5-1;
417     char *seqend=feat.seq+feat.end3-1;
418     while (pos5<feat.end3-feat.end5 &&
419     (p=lstrstr(seqstart+pos5, seqend, seed_5))!=NULL &&
420     (p-seqstart)<(feat.end3-feat.end5)/2 /* while hit is in 5' half! */
421     ) { /* big loop, going through all possible hits */
422     startpos=p-seqstart; /*offset from end5 */
423     locmaxscore=0;
424     locmaxpos=l5;
425     run=0;
426     locmaxrun=0;
427     p+=SEED_LEN;
428     /*
429     printf("Seed found: pos5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
430     pos5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4));
431     */
432     score=SEED_LEN;
433     run=SEED_LEN;
434     gaplen=0;
435     while (gaplen<max_gap && score>0 && p+gaplen<=seqend) {
436     if (toupper(p[gaplen])==chr_5) {
437     if (gaplen) { /* new poly run after a gap */
438     p+=gaplen;
439     score-=gaplen*2; /* penalty */
440     gaplen=0;
441     }
442     p++;
443     run++;
444     score++;
445     }
446     else { /* non-match */
447     if (gaplen==0) { /* new gap starting or end of sequence*/
448     if (run>locmaxrun) locmaxrun=run;
449     run=0;
450     if (score>=locmaxscore) { /* check locmax! */
451     locmaxscore=score;
452     locmaxpos=(p-seqstart); /* offset from end5 */
453     }
454     }
455     gaplen++;
456     }
457     /*
458     printf("T loop: l5=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d locmaxrun=%d\n (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
459     l5, gaplen, score, locmaxpos, locmaxscore, locmaxrun, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4));
460     */
461     }/* while */
462     if (run>locmaxrun) locmaxrun=run;
463     if (score>=locmaxscore) { /* check locmax! */
464     locmaxscore=score;
465     locmaxpos=(p-seqstart); /* offset from end5 */
466     }
467     pos5=locmaxpos;
468     locminscore=min_score+startpos/10; /* min_score is dynamically adjusted
469     based on the hit distance to the ends */
470     /*
471     printf("feat.end3=%d, feat.end5=%d, l5=%d, startpos=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n",
472     feat.end3, feat.end5, l5, startpos, locmaxpos, locmaxscore, locmaxrun);
473    
474     */
475     if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */
476     norange=1;
477     if (fixed_len) {
478     if (startpos<=fixed_len) {
479     l5=locmaxpos;
480     maxhit_score=locmaxscore;
481     maxhit_pos=startpos;
482     norange=0;
483     }
484     else break; /* strict fixed range criteria */
485     }
486     else { /* percentual distance test: assess distance to previous hit */
487     v=(feat.end3-feat.end5-l5)*perc_len/100;
488     if ((first_pass && v<MIN_PERC_LEN)) {
489     v=MIN_PERC_LEN;
490     first_pass=0;
491     }
492     if (startpos-l5<v) { /* hit in allowed range */
493     l5=locmaxpos;
494     maxhit_score=locmaxscore;
495     maxhit_pos=startpos;
496     norange=0;
497     }
498     }
499     /* test for any huge hits in the first half */
500     if (norange && l5!=locmaxpos &&
501     (locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2
502     || (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) )
503     {
504     l5=locmaxpos;
505     maxhit_score=locmaxscore;
506     maxhit_pos=startpos;
507     }
508     }
509     } /* big while loop */
510     feat.end5+=l5;
511     return maxhit_score*10-maxhit_pos;
512     }
513    
514    
515    
516     int analyse3() { /*=== end3 search: polyA (if no -X) =============== */
517     int gaplen, score, locmaxpos,
518     locmaxscore, locminscore, run, locmaxrun, v;
519     char *p;
520     int first_pass=0;
521     int norange;
522     char *seqstart=feat.seq+feat.end5-1;
523     char *seqend=feat.seq+feat.end3-1;
524     int l3=0; /*offset from end3 */
525     int pos3=l3; /*distance to end3 */
526     int startpos=0;
527     int maxhit_score=0;
528     int maxhit_pos=0;
529     while (pos3<feat.end3-feat.end5 &&
530     (p=rstrstr(seqend-pos3, seqstart, seed_3))!=NULL &&
531     (seqend-p)<(feat.end3-feat.end5)/2 /* while hit is in 3' half! */
532     ) { /* big loop, going through all possible hits */
533     /* printf("feat.end3=%d, pos3=%d, startpos=%d\n",feat.end3,pos3, seqend-p); */
534     startpos=seqend-p; /*distance to seqend */
535     locmaxscore=0;
536     locmaxpos=l3;
537     run=0;
538     locmaxrun=0;
539     /*
540     printf("Seed found *p=%c%c%c%c%c%c%c\n",
541     *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
542     */
543     p-=SEED_LEN;
544     /*
545     printf("After seed *p=%c%c%c%c%c%c%c\n",
546     *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
547     */
548     /*
549     printf("Seed found: l5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
550     l5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4) );
551     */
552     score=SEED_LEN;
553     run=SEED_LEN;
554     gaplen=0;
555     while (gaplen<max_gap && score>0 && p-gaplen>=seqstart) {
556     /*printf("A loop: l3=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n \
557     (p-feat.seq)=%d, *p=%c%c%c%c%c%c%c\n",
558     l3, gaplen, score, locmaxpos, locmaxscore, locmaxrun,
559     p-feat.seq, *(p-gaplen),*(p-gaplen+1),*(p-gaplen+2),*(p-gaplen+3),*(p-gaplen+4),*(p-gaplen+5),
560     *(p-gaplen+6));
561     */
562     if (toupper(*(p-gaplen))==chr_3) {
563     if (gaplen) { /* new poly run after a gap */
564     p-=gaplen;
565     score-=(gaplen<<1); /* penalty */
566     gaplen=0;
567     }
568     p--;
569     run++;
570     score++;
571     /*
572     printf("match: score=%d, *p=%c%c%c%c%c%c%c\n",score,
573     *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
574     */
575     }
576     else { /* non-match*/
577     if (gaplen==0) { /* new gap */
578     if (run>locmaxrun) locmaxrun=run;
579     run=0;
580     if (score>=locmaxscore) { /* check locmax! */
581     locmaxscore=score;
582     locmaxpos=(seqend-p); /* distance to end3 */
583     }
584     }
585     gaplen++; /*extend gap */
586     }
587    
588    
589     }/* while */
590     if (run>locmaxrun) locmaxrun=run;
591     if (score>=locmaxscore) { /* check locmax! */
592     locmaxscore=score;
593     locmaxpos=(seqend-p); /* distance to end3 */
594     }
595     pos3=locmaxpos;
596     locminscore=min_score+startpos/10; /* min_score is dynamically adjusted
597     based on the hit distance to the ends */
598     if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */
599     norange=1;
600     if (fixed_len) {
601     if (startpos<=fixed_len) {
602     l3=locmaxpos;
603     maxhit_score=locmaxscore;
604     maxhit_pos=startpos;
605     norange=0;
606     }
607     else break; /* strict fixed range criteria */
608     }
609     else {
610     v=((feat.end3-l3-feat.end5)*perc_len/100);
611     if (first_pass && v<MIN_PERC_LEN) {
612     v=MIN_PERC_LEN;
613     first_pass=0;
614     }
615     if (startpos-l3<v) {
616     l3=locmaxpos;
617     maxhit_score=locmaxscore;
618     maxhit_pos=startpos;
619     norange=0;
620     }
621     }
622     /* instead, just aim for big hits in the 3' half */
623     if (norange && l3!=locmaxpos &&
624     ( (locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2)
625     || (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) ) {
626     l3=locmaxpos;
627     maxhit_score=locmaxscore;
628     maxhit_pos=startpos;
629     }
630     } /* potential hit analysis */
631     } /* big while loop */
632     feat.end3-=l3;
633     return maxhit_score*10-maxhit_pos;
634     }
635    
636    
637     double N_calc(int e5, int e3, int* Ncount) {
638     /* computes the percent and count of Ns; updates feat structure */
639     int i,ch;
640     double result;
641     *Ncount=0;
642     for (i=e5;i<=e3;i++) {
643     ch=toupper(feat.seq[i]);
644     if (ch!='A' && ch!='C' && ch!='G' && ch!='T')
645     (*Ncount)++;
646     }
647     result = ((double)(100*(*Ncount)))/(e3-e5+1);
648     return result;
649     }
650    
651     void N_trim() {
652     /* computes initial perc_N and fill N_Pos
653     call N_analyse as apropriate (according to switches)
654     */
655     int i,j,ch;
656     feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount);
657     /* printf("entering N trim: %f compare to %f\n",feat.perc_N, perc_N); */
658     if (perc_N!=0 && feat.perc_N>perc_N) {
659     j=0;
660     feat.NPos=malloc(feat.NCount*sizeof(int));
661     for (i=feat.end5-1;i<feat.end3;i++) {
662     ch=toupper(feat.seq[i]);
663     if (ch!='A' && ch!='C'
664     && ch!='G' && ch!='T') {
665     feat.NPos[j]=i;
666     j++;
667     }
668     }
669     N_analyse(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
670     /* compute perc_N again after trimming! */
671     feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount);
672     if (feat.perc_N >= nsave_limit && feat.end3-feat.end5>=99) {
673     /* try to save this at any cost: */
674     N_save(feat.end5-1, feat.end3-1);
675     /* this will also update feat.perc_N and feat.NCount */
676     }
677     free(feat.NPos);
678     }
679     }
680    
681     void N_analyse(int l5, int l3, int p5, int p3) {
682     /* assumes feat was filled properly */
683     int old_dif, t5,t3,v;
684     if (l3-l5<min_read_len || p5>p3 ) {
685     feat.end5=l5+1;
686     feat.end3=l3+1;
687     return;
688     }
689    
690     t5=feat.NPos[p5]-l5;
691     t3=l3-feat.NPos[p3];
692     old_dif=p3-p5;
693     v=(int)((((double)(l3-l5))*perc_lenN)/100);
694     if (v>30) v=30; /* enforce limit for long ESTs */
695     if (t5 < v ) {
696     l5=feat.NPos[p5]+1;
697     p5++;
698     }
699     if (t3 < v) {
700     l3=feat.NPos[p3]-1;
701     p3--;
702     }
703     /* restNs=p3-p5; number of Ns in the new CLR */
704     if (p3-p5==old_dif) { /* no change, return */
705     /* don't trim if this may shorten < 100 */
706     if (l5-l3<min_read_len) return;
707     feat.end5=l5+1;
708     feat.end3=l3+1;
709     }
710     else
711     N_analyse(l5,l3, p5,p3);
712     }
713    
714     void N_save(int l5, int l3) {
715     /* find the maximum range having percN<3 */
716     /* needs feat.NPos[], feat.NCount to be set before*/
717     int countN,i,p5,p3, max_extent, start3, max_5, max_3, e3, e5, max_countN=0;
718     double percN=0,max_percN=0;
719     for (i=0;feat.NPos[i]<=feat.end5-1;i++);
720     p5=i-1; /* found first N's position in the selected range */
721     for (i=feat.NCount-1;feat.NPos[i]>=feat.end3-1;i--);
722     p3=i+1; /* found last N's position in the selected range */
723     if (l3-l5<min_read_len || p5>p3 ) return;
724     e5=p5;e3=p3;
725     start3=l3;
726     max_extent=99;max_3=0;max_5=0;
727     while (l3-l5>max_extent /* && e3>=e5 */) { /* outer end5 loop */
728     e3=p3;
729     percN=N_calc(l5,l3, &countN);
730     while (percN>=nsave_limit && l3-l5>max_extent) { /* e3 loop -- decreases until percN<3 */
731     /* printf("l5=%d, l3=%d; e5=%d (%d), e3=%d(%d), percN=%f, %d ? %d\n",l5,l3,e5,
732     feat.NPos[e5], e3, feat.NPos[e3], percN, l3-l5, max_extent);
733     */
734     e3--;
735     while (e3>e5 && feat.NPos[e3-1]==feat.NPos[e3]-1) e3--;
736     /* skip consecutive Ns */
737     l3=feat.NPos[e3]-1;
738     percN=N_calc(l5,l3, &countN);
739     }
740     if (l3-l5>max_extent) {
741     max_extent=l3-l5;
742     max_5=l5;max_3=l3;
743     max_percN=percN;
744     max_countN=countN;
745     }
746     while (e3>e5 && feat.NPos[e5+1]==feat.NPos[e5]+1) e5++;
747     /* skip consecutive Ns */
748     l5=feat.NPos[e5]+1;
749     /* restart end3: */
750     l3=start3;e3=p3;
751     e5++;
752     } /* outer loop */
753     if (max_3!=0) {
754     feat.end5=max_5+1;
755     feat.end3=max_3+1;
756     feat.perc_N=max_percN;
757     feat.NCount=max_countN;
758     }
759     }