ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/yamap/yamap_embl.pl
Revision: 1.3
Committed: Wed Dec 13 11:19:45 2006 UTC (9 years, 5 months ago) by gawi79
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +82 -88 lines
Log Message:
Re-written to work with both single sequence files and multi-sequence files

Line File contents
1 #!/usr/bin/perl
2
3 # A script to read in fasta files and Artemis feature tables,
4 # and to write out as EMBL formatted files
5
6 use strict;
7 use Bio::SeqIO;
8 use File::Basename;
9 use IO::File;
10 use Getopt::Std;
11
12 my $id = `hostname --long`;
13 chomp($id);
14
15 # usage
16 unless (@ARGV ==3) {
17 die "\n\nProper Command Line Usage: yamap_embl.pl infile outfile multiseq\nPlease try again.\n\n\n";}
18
19 my $file = shift;
20 my $dir = shift;
21 my $multiseq = shift;
22
23 my $embl_dir = dirname($dir);
24 opendir(DIR, $dir) || warn "Can't opendir $dir: $!";
25 my @tabs = grep { /\.tab$/ || /\.crunch$/ } readdir(DIR);
26 closedir DIR;
27 my $pwd =`pwd`;
28 print "pwd = $pwd\n";
29 # open seq.in and seq.out files
30 #print "Reading file " . $basename . "...\n";
31 my $infile = Bio::SeqIO->new(-file => "$dir/$file",
32 -format => "FASTA");
33 my ($seqlen,$seq);
34 while (my $seqin = $infile->next_seq())
35 {
36 $seq = $seqin->seq();
37 $seqlen = length $seq;
38 }
39 # print EMBL headers
40 print "Writing file $file.embl...\n";
41 my $outfile = "$embl_dir/$file.embl";
42 open (OUT,">$outfile") or die "Can't open $outfile for writing: $!";
43 print OUT <<EOF;
44 ID $file standard; metagenomic DNA; CON; $seqlen BP.
45 AC
46 DE $id
47 XX
48 FT source 1..$seqlen
49 EOF
50
51 # foreach art file, print to file
52 foreach my $tab (@tabs)
53 {
54 my $artio = IO::File->new("$dir/$tab", 'r') or die "could not open $tab: $!";
55 if ($tab =~ /\.crunch$/)
56 {
57 # if a crunch file, format first
58 while (my $line = <$artio>)
59 {
60 chomp($line);
61 my @parts = split(/\s+/, $line);
62 my $notes;
63 my $end = length (@parts) -1;
64 for (my $j=8; $j<=$end; $j++)
65 {
66 $notes .= "$parts[$j]";
67 $notes .= " " unless ($j == $end);
68 }
69 print OUT <<EOF;
70 FT CRUNCH_D $parts[2]..$parts[3]
71 FT /blast_score=$parts[0]
72 FT /score=$parts[1]
73 FT /percent_id=$parts[1]
74 FT /query_id=$parts[4]
75 FT /subject_start=$parts[5]
76 FT /subject_end=$parts[6]
77 FT /subject_id=$parts[7]
78 FT /note="hit to $parts[7] $parts[5]..$parts[6] score: $parts[0] percent id: $parts[1] my $notes"
79 EOF
80 }
81 }
82 elsif ($tab =~ /\.tab$/)
83 {
84 # if a .tab file, print out directly
85 while (<$artio>)
86 {
87 print OUT;
88 }
89 }
90 }
91
92 print OUT "XX\n";
93 # print the actual sequence
94 my $sequence = $seq;
95 my $base_a = $sequence =~ tr/aA/aA/;
96 my $base_c = $sequence =~ tr/cC/cC/;
97 my $base_g = $sequence =~ tr/gG/gG/;
98 my $base_t = $sequence =~ tr/tT/tT/;
99 print OUT "SQ Sequence $seqlen BP; $base_a A; $base_c C; $base_g G; $base_t T; $seqlen other;\n";
100 # now for the cunning EMBL formatting bit - 6 groups of 10
101 my @blobs = $sequence =~ /\D{1,10}/g;
102 my $numlines = int(((scalar @blobs)/6) + 0.5);
103 my $lineend = 60;
104 for (my $i = 0; $i <= ($numlines); $i++)
105 {
106 print OUT " ";
107 for (my $j = 0; $j <= 5; $j++)
108 {
109 my $plonk = shift(@blobs);
110 if (defined $plonk)
111 {
112 if (length $plonk == 10) { print OUT "$plonk"; }
113 else
114 { print OUT $plonk, " " x (10 - (length $plonk)) };
115 }
116 else { print OUT " "; }
117 print OUT " ";
118 }
119 print OUT " $lineend\n";
120 $lineend += 60;
121 }
122
123 # vital end character
124 print OUT "//";
125
126 # close the EMBL file
127 close OUT;
128
129
130
131
132 __END__
133 my $b=0;
134 my $c=0;
135 for ($c=0; $c < $seqlen; $c+=10)
136 {
137 print OUT " " if $b%6==0;
138 print OUT " ", lc(substr($sequence, $c, 10));
139 printf OUT ("%10d\n", $c+10) if $b%6==5;
140 $b++;
141 }