ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/jigs_grd_train.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 16523 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2
3 use strict;
4 use Getopt::Std;
5 use File::Basename;
6 use Cwd qw(abs_path cwd);
7 umask 0002;
8
9 my $usage = q~
10 Jigsaw training script on the grid, version 1.4
11
12 jigs_grd_train.pl [options]
13
14 Options:
15
16 -f <fasta_suffix> - the filename suffix of the genomic sequence file
17 (default 'fa')
18 -e <evidence_fie> - evidence file (required)
19 -d <output_dir> - directory to place decision trees in
20 -l <dirlist_file> - (required) the assumption is, a single directory
21 contains all of the information for one genomic seq
22 -p <adj_nt_len> - specify how many nucleotides adjacent to gene model
23 to include in training (default is 50)
24 -m <e-mail_to_notify> - have gridx send an e-mail upon completion
25 -n <num_trees> - number of decision trees to use (default is 10)
26 -t - two types of decision trees can be used, when this
27 option is given, the faster but potentially less
28 accurate trees are used; the default is to use the
29 method which generates slow trees
30 -g <grid_engine> - grid engine to use: 'smp', 'sge' or 'condor'
31 (default is 'condor')
32 -c <numCPUs> - max grid nodes (CPUs) to use (default 20)
33
34 Miscellaneous Options:
35 -h - print this help message
36 -V - obtain program version
37 ~;
38
39 print STDERR "Command line: $0 ".join(' ',@ARGV)."\n";
40
41 getopts('tc:f:m:g:e:d:l:p:n:o:') || die($usage."\n");
42 my $outfile=$Getopt::Std::opt_o;
43
44 my ($evidence_file,$ntree, $treedir,
45 $idir_list, $adj_nt, $fasta, $tsplit)=
46 ($Getopt::Std::opt_e, $Getopt::Std::opt_n, $Getopt::Std::opt_d,
47 $Getopt::Std::opt_l, $Getopt::Std::opt_p, $Getopt::Std::opt_f, $Getopt::Std::opt_t);
48
49 my $gridengine=$Getopt::Std::opt_g || 'condor';
50 my $email=$Getopt::Std::opt_m;
51 my $maxCPUs=$Getopt::Std::opt_c || 20;
52 $treedir='.' unless $treedir;
53 $fasta='fa' unless $fasta;
54 $adj_nt = 50 unless $adj_nt;
55 $ntree = 10 unless $ntree;
56
57 my $CMDQFile='.cmdQueue.'.$$.'.gridcmds';
58
59 if ($treedir ne '.' && ! -d $treedir) {
60 mkdir($treedir) || die("Error creating training directory: $treedir ($!)\n");
61 }
62
63 die("$usage") unless -f $evidence_file && -f $idir_list;
64
65 my $gridCmd="";
66
67 $tsplit = '-a' if defined($tsplit);
68
69 my $tfname = $evidence_file;
70
71 my @asmbls;
72 open(FILE,$idir_list) || die("Cannot open $idir_list: $!");
73 while(my $line = <FILE>) {
74 chomp($line);
75 push( @asmbls, $line);
76 }
77 close(FILE);
78 my $evidence;
79 my $train_template;
80 open(FILE,$evidence_file) || die("Cannot open $evidence_file: $!");
81 while(my $line = <FILE>) {
82 chomp($line);
83 $evidence .= "$line:";
84 if($line =~ /curation/ ) {
85 my @tmp=split(/\s+/,$line);
86 $train_template= $tmp[0];
87 }
88 }
89 close(FILE);
90
91 if ( $treedir ne '.' && ! -d $treedir ) {
92 mkdir($treedir) || die("Error creating directory '$treedir' !\n");
93 }
94
95 my @vecIds = ("acc","don","code","start","stop","intron");
96 my @vlst;
97 my $iter;
98 for($iter = 0; $iter <= $#asmbls; $iter++) {
99 my $dir = $asmbls[$iter];
100 my $dir_id = basename($dir);
101 my $prefix="$dir/$dir_id.";
102 # if ( $addPrefix ) {
103 # $prefix = "$dir/${dir_id}_";
104 # if( ! (-e "${prefix}$fasta") ) {
105 # $prefix = "$dir/$dir_id.";
106 # }
107 # } else {
108 # $prefix = "$dir/";
109 # }
110 my $mytrain_file = $prefix . $train_template;
111 if( -e $mytrain_file ) {
112 my $vid = "ev_vec";
113 writeTrainFile($tfname,$evidence,$prefix);
114 my $vecfile = "$treedir/$dir_id.$vid";
115 runCmd("jigsaw -f ${prefix}$fasta -e ${prefix}$tfname -t $vecfile -d $treedir -q $adj_nt");
116 #die("Error at running jigsaw: $!\n") if $?;
117 remove_incomplete("$vecfile.start");
118 remove_incomplete("$vecfile.stop");
119 for (my $jiter = 0; $jiter <= $#vecIds; $jiter++) {
120 my $vf = "$vecfile.$vecIds[$jiter]";
121 if ( !$vlst[$jiter] ) {
122 $vlst[$jiter] = $vf;
123 } else {
124 $vlst[$jiter] = $vlst[$jiter] . " " . $vf;
125 }
126 }
127 } else {
128 print "Warning: $mytrain_file does not exist, continue anyway.\n";
129 }
130 }
131
132 my @con_data;
133 my @dtFiles;
134 for($iter = 0; $iter <= $#vecIds; $iter++) {
135 #print STDERR "Converting vectors to triplet vectors for $vecIds[$iter]..\n";
136 #print STDERR " calling: convert_vectors_to_triplet_vectors()\n";
137 my $vecs = convert_vectors_to_triplet_vectors($vlst[$iter]);
138 my $cntOcc = 1;
139 $cntOcc = 0 if ( $iter == 2);
140 #print STDERR " calling: wght()\n";
141 $con_data[$iter] = wght($vecs,$cntOcc);
142 #print STDERR " calling: writeFilesAndMakeTrees()\n";
143 my $dtFile = writeFilesAndMakeTrees($treedir,$con_data[$iter],$vecIds[$iter]);
144 push(@dtFiles,$dtFile);
145 }
146
147 my $startDir=cwd(); #from Cwd module
148
149 open(RUNQ, '>'.$CMDQFile) || die("Error creating $CMDQFile!\n");
150 foreach my $dtfile (@dtFiles) {
151 gtree($gridCmd,abs_path($dtfile),$ntree,$tsplit);
152 #this will prepare the file
153 }
154 close(RUNQ);
155
156 #now submit the command queue to the grid:
157 #print STDERR "gridx -g $gridengine -p $maxCPUs -f $CMDQFile -m 'gpertea\@umiacs.umd.edu' -N -O logs_jigs_train -q\n";
158 my $cmd="gridx -g $gridengine -p $maxCPUs -f $CMDQFile";
159 $cmd.=" -m $email" if $email;
160 &runCmd("$cmd -N -O logs_jigs_train -q");
161 #&runCmd("/bin/rm -f $treedir/*.ev_vec.*");
162
163 exit(0);
164
165 #------------------------------------------------
166
167 sub writeFilesAndMakeTrees {
168 my ($treedir,$data,$id) = @_;
169 my $dtfile = "$treedir/$id";
170 my $cnfile = "$treedir/$id.con";
171 open(DTFILE,">$dtfile") || die "unable to write to [$dtfile]\n";
172 open(CNFILE,">$cnfile") || die "unable to write to [$cnfile]\n";
173 my @vals = split(/\n/,$data);
174 for(my $iter = 0; $iter <= $#vals; $iter++) {
175 my @subvals = split(/ /,$vals[$iter]);
176 my $class = 1;
177 if( $subvals[$#subvals] >= 0.5 ) {
178 $class = 2;
179 }
180 for(my $si = 0; $si < $#subvals; $si++) {
181 print DTFILE "$subvals[$si] ";
182 }
183 print DTFILE "$class\n";
184 print CNFILE "$vals[$iter]\n";
185 }
186 close(DTFILE);
187 close(CNFILE);
188 return $dtfile;
189 }
190
191 sub gtree {
192 my ($gridcmd,$file,$num_trees,$split) = @_;
193 for(my $cnt = 1; $cnt <= $num_trees; $cnt++) {
194
195 #my $seedcmd = time() # date"date +%s |";
196 #open(TPIPE,$seedcmd) || die "unable to open [$seedcmd]\n";
197 #my $seed = <TPIPE>;
198 #$seed =~ s/\n//;
199 #close(TPIPE);
200 my $seed=int(rand(120103+$cnt))+$cnt+1299;
201 my $cmd="";
202 #my $output = basename($file).'.dt.'.$cnt;
203 my $output = "$file.dt.$cnt";
204 print RUNQ "mktree -t $file -s $seed -D $output $split\n";
205
206 # if( $gridcmd ) {
207 # my $output = "$file.dt.$cnt";
208 # $cmd = "${gridcmd} -c \"mktree -t $file -s $seed -D $output $split\" -nowait";
209 # } else {
210 # $cmd = "mktree -t $file -s $seed $split";
211 # }
212 # print "$cmd\n";
213 # system($cmd);
214 # if ( !$gridcmd ) {
215 # if ( -e "$file.dt" ) {
216 # $cmd = "mv $file.dt $file.dt.$cnt";
217 # system($cmd);
218 # if ( -e "$file.dt.unpruned" ) {
219 # $cmd = "rm -f $file.dt.unpruned";
220 # system($cmd);
221 # }
222 # } else {
223 # print "some unanticipated problems running mktree\n";
224 # print "[$cmd] did not create: [$file.dt]\n";
225 # }
226 # sleep(5);
227 # }
228 } #for each tree
229 # don't need the dt file anymore
230 # &runCmd("/bin.rm -f $file)") if(!$gridcmd);
231 }
232
233 sub writeTrainFile {
234 my ($tfname,$evlst,$prefix) = @_;
235 my @evlst = split(/:/,$evlst);
236 my $traindata = "${prefix}$tfname";
237 print "write: $traindata\n";
238 open(TRAINDATA,">$traindata") || die "unable to write [$traindata]\n";
239 for(my $iter = 0; $iter <= $#evlst; $iter++) {
240 print TRAINDATA "$prefix$evlst[$iter]\n";
241 }
242 close(TRAINDATA);
243 }
244
245 sub convert_vectors_to_triplet_vectors {
246 my ($lst) = @_;
247 my @files = split(/ /,$lst);
248 my %lookup;
249 for(my $iter = 0; $iter <= $#files; $iter++) {
250 # print "red file: $files[$iter]\n";
251 compress($files[$iter],\%lookup);
252 }
253 return printLookup(\%lookup);
254 }
255
256 #####################################
257 ## assume come in ascending order
258 #####################################
259
260 sub compress {
261 my ($file,$lookup_ref) = @_;
262 #print "open $file\n";
263 if( !open(FILE,$file) ) {
264 return;
265 }
266 my @lines = ("","","","","","");
267 my $fhalf = ($#lines+1)/2;
268 my $fail = 0;
269 for(my $iter = 0; $iter <= $fhalf-1; $iter++) {
270 #print "$iter, $fhalf\n";
271 if($lines[$iter] = <FILE>) {
272 chomp($lines[$iter]);
273 } else {
274 $fail = 1;
275 last;
276 }
277 if($lines[$iter+$fhalf] = <FILE>) {
278 chomp($lines[$iter+$fhalf]);
279 } else {
280 $fail = 1;
281 last;
282 }
283 }
284 if( $fail ) {
285 close(FILE);
286 return;
287 }
288 while($lines[5]) {
289 my $tvals="";
290 for(my $iter = 0; $iter <= $fhalf - 1; $iter++) {
291 $tvals .= $lines[$iter];
292 if( $iter < $fhalf-1 ) {
293 $tvals .= "#";
294 }
295 }
296 my ($id1,$answer1,$vid1,$isValid1,$strnd1) = makeId($tvals);
297 my $tvals2="";
298 for(my $iter = $fhalf; $iter <= $#lines; $iter++) {
299 $tvals2 .= $lines[$iter];
300 if( $iter < $#lines ) {
301 $tvals2 .= "#";
302 }
303 }
304 my ($id2,$answer2,$vid2,$isValid2,$strnd2) = makeId($tvals2);
305 if( $isValid1 ) {
306 doStore($id1,$answer1,$vid1,$strnd1,$lookup_ref);
307 }
308 if( $isValid2 ) {
309 doStore($id2,$answer2,$vid2,$strnd2,$lookup_ref);
310 }
311 #3
312 for(my $iter = 0; $iter <= $fhalf - 2; $iter++) {
313 $lines[$iter] = $lines[$iter+1];
314 }
315 if( $lines[$fhalf-1] = <FILE> ) {
316 chomp($lines[$fhalf-1]);
317 } else {
318 last;
319 }
320 for(my $iter = $fhalf; $iter <= $#lines - 1; $iter++) {
321 $lines[$iter] = $lines[$iter+1];
322 }
323 if( $lines[$#lines] = <FILE> ) {
324 chomp($lines[$#lines]);
325 } else {
326 last;
327 }
328 }
329 close(FILE);
330 }
331
332 sub printLookup {
333 my ($lookup_ref) = @_;
334 my $vals;
335 my $cnt = 0;
336 my $tstop = 0;
337 foreach my $id (keys %$lookup_ref) {
338 if( !$id ) {
339 next;
340 }
341 my ($nocode, $code);
342 if($$lookup_ref{$id}{0}) {
343 $nocode =$$lookup_ref{$id}{0};
344 } else {
345 $nocode =0;
346 }
347 if($$lookup_ref{$id}{1}) {
348 $code =$$lookup_ref{$id}{1};
349 } else {
350 $code =0;
351 }
352 my @vals = split(/ /,$id);
353 if(!$nocode && !$code) {
354 print "problems...";
355 print "[$id]\n";
356 exit(1);
357 }
358 #my $exp = $nocode+$code;
359 my $cnt = 0;
360 for(my $iter = 0; $iter <= $#vals; $iter++) {
361 $cnt += $vals[$iter];
362 }
363 if( !$cnt ) {
364 next;
365 }
366 #print "$id ";
367 $vals .= "$id ";
368 $vals .= printCode($nocode);
369 #print " ";
370 $vals .= " ";
371 $vals .= printCode($code);
372 $vals .= "\n";
373 #print "\n";
374 $cnt++;
375 }
376 return $vals;
377 }
378
379 sub printCode {
380 my ($string) = @_;
381 my @vals = split(/\+/,$string);
382 my $max = 0;
383 my $sum = 0;
384 my $size = scalar(@vals);
385 for( my $iter = 0; $iter < $size; $iter++) {
386 if( $vals[$iter] > $max ) {
387 $max = $vals[$iter];
388 }
389 $sum += $vals[$iter];
390 }
391 my $avg = 0;
392 if( $size > 0 ) {
393 $avg = ($sum/$size);
394 }
395 if( $size == 1 && $vals[0] == 0) {
396 $size = 0;
397 }
398 return "$max $size $sum $avg";
399 }
400
401 sub makeId {
402 my ($lns) = @_;
403 #print "makeId input: [$lns]\n";
404 my @lines = split(/\#/,$lns);
405 my $id = "";
406 my $vid = "";
407 my $answer = -1;
408 my $isValid = 1;
409 my $middle = $#lines/2;
410 if( $middle != 1 || $#lines != 2 ) {
411 print "error check failed, $middle, [$lns], $#lines\n";
412 exit(1);
413 }
414 my $line0 = $lines[0];
415 my @vals0 = split(/ /,$line0);
416 my $line1 = $lines[1];
417 my @vals1 = split(/ /,$line1);
418 my $line2 = $lines[2];
419 my @vals2 = split(/ /,$line2);
420 my $lGap = $vals1[0]-$vals0[1];
421 my $rGap = $vals2[0]-$vals1[1];
422 my ($mEnd5,$mEnd3) = ($vals1[0],$vals1[1]);
423 if( $lGap < 1 ) {
424 print "error: $lns, [$lGap] \n";
425 exit(1);
426 }
427 if( $rGap < 1 ) {
428 print "error 2 \n";
429 exit(1);
430 }
431 my $strnd;
432 for(my $iter = 0; $iter <= $#lines; $iter++) {
433 my $line = $lines[$iter];
434 my @vals = split(/ /,$line);
435 if(0) { # is there a good reason to keep these out?
436 if( $iter == $middle ) {
437 my $doCont = 0;
438 for(my $cnt =3; $cnt <= $#vals-1; $cnt++) {
439 if( $vals[$cnt] ) {
440 $doCont = 1;
441 last;
442 }
443 }
444 if( ! $doCont ) {
445 $isValid = 0;
446 last;
447 }
448 }
449 }
450 my ($v1,$v2);
451 if( $iter == 0 && $lGap > 1 ) {
452 $v1=$v2=$mEnd5-1;
453 } elsif( $iter == 2 && $rGap > 1 ) {
454 $v1=$v2=$mEnd3+1;
455 } else {
456 ($v1,$v2) = ($vals[0],$vals[1]);
457 }
458 if( !$vid ) {
459 $vid = $v1 . " " . $v2;
460 } else {
461 $vid = $vid . " " . $v1 . " " . $v2;
462 }
463 my $bp = $v2-$v1 + 1;
464 my $subid = "";
465 for(my $jiter = 3; $jiter < $#vals; $jiter++) {
466 if( ($lGap == 1 && $iter==0) || $iter==1 || ($iter==2 && $rGap==1) ) {
467 $subid .= $vals[$jiter];
468 } else {
469 $subid .= "0"
470 }
471 if( $jiter < $#vals - 1 ) {
472 $subid .= " ";
473 }
474 }
475 $strnd = $vals[2];
476 if( !$id ) {
477 $id = $subid;
478 } elsif( $vals[2] eq "+" ) {
479 $id = $id . " " . $subid;
480 } else {
481 $id = $subid . " " . $id;
482 }
483 if( $iter == $middle ) {
484 $answer = $vals[$#vals];
485 }
486 }
487 if( (!$id || $answer == -1) && $isValid ) {
488 print "fatal error, [$id], [$answer], [$lns]\n";
489 exit(1);
490 }
491 return ($id,$answer,$vid,$isValid,$strnd);
492 }
493
494 sub doStore {
495 my ($id,$answer,$vls,$strnd,$lookup_ref) = @_;
496 #print "[$id]\n [$answer]\n[$vls]\n";
497 my @vals = split(/ /,$vls);
498 my $middle = ($#vals+1)/2 - 1;
499 my $bp = 0;
500 for(my $iter = 0; $iter < $#vals; $iter+=2) {
501 $bp += $vals[$iter+1]-$vals[$iter]+1;
502 }
503 my $v1 = $vals[$middle]-$vals[$middle-1];
504 #print "$vals[$middle] $vals[$middle-1] $vals[$middle+2] $vals[$middle+1]\n";
505 if( $v1 < 1 ) {
506 print "error @vals $middle\n";
507 exit(1);
508 } else {
509 #$v1 = 1.0 - (1.0/$v1);
510 }
511 my $v2 = $vals[$middle+2]-$vals[$middle+1];
512 if( $v2 < 1 ) {
513 print "error 2 @vals, $middle\n";
514 exit(1);
515 }
516 my $diststr;
517 if( $strnd eq "+" ) {
518 $diststr = $v1 . " " . $v2;
519 } else {
520 $diststr = $v2 . " " . $v1;
521 }
522 if( $$lookup_ref{$id}{$answer} ) {
523 $$lookup_ref{$id}{$answer} .= "+" . $bp;
524 #print "pow: $$lookup_ref{$id}{$answer}\n";
525 } else {
526 #print "$id $answer\n";
527 $$lookup_ref{$id}{$answer} = $bp;
528 #print "poww: $$lookup_ref{$id}{$answer}\n";
529 }
530 }
531
532 sub wght {
533 my ($data,$cntOcc) = @_;
534 my @data_arr = split(/\n/,$data);
535 my $noCode = 0;
536 my $code = 0;
537 my $numFeat;
538 my $noCodeIdx;
539 my $codeIdx;
540 my $results;
541
542 for(my $iter = 0; $iter <= $#data_arr; $iter++) {
543 my $line = $data_arr[$iter];
544 my @vals = split(/ /,$line);
545 $numFeat = $#vals-7;
546 if( $cntOcc ) {
547 $noCodeIdx = $#vals-6;
548 } else {
549 $noCodeIdx = $#vals-5;
550 }
551 $codeIdx = $noCodeIdx+4;
552 $noCode += $vals[$noCodeIdx];
553 $code += $vals[$codeIdx];
554 }
555
556 my $max = 0;
557 for(my $iter = 0; $iter <= $#data_arr; $iter++) {
558 my $line = $data_arr[$iter];
559 $line =~ s/\n//;
560 my @vals = split(/ /,$line);
561 my $val1 = 0;
562 if( $noCode > 0 ) {
563 $val1 = ($vals[$noCodeIdx]/$noCode);
564 }
565 my $val2 = 0;
566 if( $code > 0 ) {
567 $val2 = ($vals[$codeIdx]/$code);
568 }
569 my $tval = 0;
570 if( $val1+$val2 > 0 ) {
571 $tval = -log($val1+$val2);
572 }
573 if($tval > $max) {
574 $max = $tval;
575 }
576 }
577
578 if( $max <= 0) {
579 print "training error - probably a directory structure/file format isue.\n";
580 exit(1);
581 }
582
583 for(my $iter = 0; $iter <= $#data_arr; $iter++) {
584 my $line = $data_arr[$iter];
585 $line =~ s/\n//;
586 my @vals = split(/ /,$line);
587 my $val1 = 0;
588 if( $noCode > 0 ) {
589 $val1 = ($vals[$noCodeIdx]/$noCode);
590 }
591 my $val2 = 0;
592 if( $code > 0 ) {
593 $val2 = ($vals[$codeIdx]/$code);
594 }
595 my $tnc = $vals[$noCodeIdx];
596 my $tc = $vals[$codeIdx];
597 my $prob = 0;
598 if( $tnc+$tc > 0 ) {
599 $prob = $tc/($tnc+$tc);
600 }
601 my $numPred = 0;
602 for(my $iter = 0; $iter < $numFeat; $iter++) {
603 if($vals[$iter] > 0) {
604 $numPred++;
605 }
606 }
607 for(my $iter = 0; $iter < $numFeat; $iter++) {
608 $results .= "$vals[$iter] ";
609 }
610 my $tval = 0;
611 if( $val1+$val2 > 0 ) {
612 $tval = -log($val1+$val2);
613 }
614 if( $max > 0 ) {
615 $tval = ($tval/$max);
616 }
617 $tval = 1-$tval;
618 $tval += 0.01;
619 $results .= "$prob\n";
620 #print "$prob $tval ";
621 #print "\n";
622 }
623 return $results;
624 }
625
626 sub remove_incomplete {
627 my ($file) = @_;
628 open(FILE,$file) || die "unable to open [$file]\n";
629 my $pass = 0;
630 while(my $line = <FILE>) {
631 chomp($line);
632 my @vals = split(/ /,$line);
633 if( $vals[$#vals] ) {
634 $pass = 1;
635 last;
636 }
637 }
638 close(FILE);
639 if( !$pass ) {
640 print "looks like incomplete gene removing file....$file\n";
641 system("rm -f $file");
642 }
643 }
644
645 sub runCmd {
646 my $cmd = $_[0];
647 print STDERR "#>running: $cmd\n";
648 system($cmd);
649 }

Properties

Name Value
svn:executable *