ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/ace2fasta.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 2502 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 ace2fasta.pl -o <contigs_fasta> [-c <components_file>] <acefile>
7 /;
8 umask 0002;
9 getopts('o:c:') || die($usage."\n");
10 my $outfile=$Getopt::Std::opt_o ||
11 die("$usage\nMust specify output file name!\n");
12 open(CFASTA, '>'.$outfile) || die("Error creating file $outfile!\n");
13 my $lnkfile=$Getopt::Std::opt_c;
14 if ($lnkfile) {
15 open(COMP, '>'.$lnkfile) || die("Error creating file $lnkfile!\n");
16 }
17
18 my $ctg; #current contig
19 my $ctgseq; #current contig sequence
20 my $ctglen; #current contig seq length
21 my $numseqs;
22 my @ctgcomp; #list of component sequence names
23 my %seqs; # seq => [ seqlen, strand, seqL, seqR, asmL, asmR ]
24 my %ctgnames;
25 while (<>) {
26 if (m/^CO\s+(\S+)\s+(\d+)\s+(\d+)/) {
27 my ($c, $l, $n)=($1, $2, $3);
28 #if ($c ne $ctg) {
29 &writeCtg() if $ctg;
30 undef %seqs;
31 undef @ctgcomp;
32 my $ctgdup=++$ctgnames{$c};
33 $c='ASM_'.$c;
34 $c.='.'.($ctgdup-1) if ($ctgdup>1);
35 ($ctg, $ctglen, $numseqs)=($c, $l, $n);
36 # }
37 $ctgseq='';
38 my $seqline;
39 while (<>) {
40 ($seqline)=(/^(\S+)/);
41 last unless $seqline;
42 $ctgseq.=$seqline;
43 }
44 } #contig start line
45 elsif (m/^AF (\S+) ([UC]) ([\-\d]+)/) {
46 my $strand=($2 eq 'U')?'+':'-';
47 $seqs{$1}=[0, $strand, 0, 0, $3, 0]; #the untrimmed asmL position for now
48 }
49 elsif (m/^RD (\S+) (\d+) (\d+) (\d+)/) {
50 my ($seqname, $seqlen)=($1, $2);
51 my $seqd=$seqs{$seqname};
52 push(@ctgcomp, $seqname);
53 die("Error at ACE parsing: no sequence found for RD $seqname\n")
54 unless $seqd;
55 $seqd->[0]=$seqlen;
56 my $rdseq='';
57 my $seqline;
58 while ( defined($_=<>) && (($seqline)=(/(\S+)/))) {
59 $rdseq.=$seqline;
60 }
61 do {
62 $_=<>;
63 } until m/^QA (\d+) (\d+)/ || !defined($_);
64 die("Couldn't find the QA entry for RD $seqname!\n") unless defined($_);
65 ($seqd->[2], $seqd->[3]) = ($1, $2);
66 my ($trimL, $trimR)=($seqd->[2]-1, $seqlen-$seqd->[3]);
67 $seqd->[5]=$seqd->[4]+$seqlen-$trimR-1;
68 $seqd->[4]+=$trimL;
69 }
70 }
71
72 writeCtg() if $ctg;
73
74 close(CFASTA);
75 close(COMP) if $lnkfile;
76
77 sub writeCtg {
78 print CFASTA ">$ctg $numseqs\n";
79 $ctgseq=~tr/-*\n\r//d;
80 # $ctglen=length($ctgseq);
81 print CFASTA join("\n", (unpack('(A72)*',$ctgseq)))."\n";
82 if ($lnkfile) {
83 print COMP ">$ctg $numseqs $ctglen\n";
84 foreach my $seqname (@ctgcomp) {
85 my $sd=$seqs{$seqname} ||
86 die("Error: no seqdata for component $seqname!\n");
87 print COMP $seqname.' '.join(' ',@$sd)."\n";
88 }
89 }
90
91 }

Properties

Name Value
svn:executable *