ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/modelit/scripts/parse_blast.pl
Revision: 1299
Committed: Tue Jan 11 17:38:33 2011 UTC (8 years, 9 months ago) by hstehr
File size: 12514 byte(s)
Log Message:
adding Model-It server files to repository
Line User Rev File contents
1 hstehr 1299 #!/usr/bin/perl
2     # Example taken from Chapters 8 and 12 of "Beginning perl for bioinformatics"
3     # Modified by Ioannis Filippis
4    
5     # Examples 12-1 and 12-2 Parse alignments from BLAST output file
6    
7     use strict;
8     use warnings;
9     #use BeginPerlBioinfo; # see Chapter 6 about this module
10     use Getopt::Long;
11    
12     # declare and initialize variables
13     my $beginning_annotation = '';
14     my $ending_annotation = '';
15     my %alignments = ( );
16     my $alignment = '';
17     my @HSPs = ( );
18     my($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end) = ('','','','','','','');
19     my %args = ();
20     my $residues;
21     my @residues;
22     my $origQueryResidues = '';
23     my $psipredResidues = '';
24     my $origQueryRes;
25     my $alignQueryRes;
26     my $psipred;
27     my $help;
28     my $queryFile = '';
29     my $queryFasta = '';
30     my $queryHeader = '';
31     my $queryId = '';
32     my $subjectGiven = '';
33     my $subjectFasta = '';
34     my $finalQueryFasta = '';
35     my $finalSubjectFasta = '';
36     my $queryOrig2Align;
37     my $queryAlign2Orig;
38     my $subjectOrig2Align;
39     my $subjectAlign2Orig;
40     my $psipredPred;
41     my $psipredConf;
42     my $dssp;
43    
44     # prints usage
45     # - if not all necessary arguments are passed or
46     # - help option is passed
47     GetOptions("help|?" =>\$help,
48     "input=s"=>\$args{input},
49     "pdbcode=s"=>\$args{pdbcode},
50     "chaincode=s"=>\$args{chaincode},
51     "alignment=s"=>\$args{alignment},
52     "residues:s"=>\$residues,
53     "secstruc:s"=>\$psipred);
54     if (defined $help) {
55     usage();
56     die;
57     }
58     foreach (keys %args) {
59     if (!(defined $args{$_} && exists $args{$_})) {
60     usage();
61     die;
62     }
63     }
64    
65     $queryFile = join( '', get_file_data($args{input}));
66     ($queryHeader, $queryFasta) = ($queryFile =~ />(.*?)\n(.*)/ms);
67     $queryFasta =~ s/\n//g;
68     ($queryId) = ($queryHeader =~ /^([^ \t]+)/);
69    
70     $subjectGiven = $args{pdbcode}."|".$args{chaincode};
71     $subjectGiven =~ tr/a-z/A-Z/;
72    
73     $subjectFasta = `dumpseq -p $args{pdbcode}$args{chaincode} -s -N`;
74     $subjectFasta =~ s/\n//;
75    
76     if (defined $psipred) {
77     ($psipredPred, $psipredConf) = parse_psipred($psipred);
78     $psipredPred =~ tr/C/ /;
79     $psipredConf =~ tr/C/ /;
80    
81     #This works only in shell : DO NOT DELETE
82     #$dssp= `dumppdb -p $args{pdbcode}$args{chaincode} -s | dssp -- 2>/dev/null | perl -pe 's/^.*\.$//' | sed -e '/^$/d' | sed -e '1d' | cut -c 17 | perl -pe 's/\n//'`;
83     `dumppdb -p $args{pdbcode}$args{chaincode} -s | dssp -- 2>/dev/null 1>$args{pdbcode}$args{chaincode}.dssp`;
84     $dssp = parse_dssp("$args{pdbcode}$args{chaincode}.dssp");
85     unlink("$args{pdbcode}$args{chaincode}.dssp");
86     $dssp =~ tr/[HGI]/H/;
87     $dssp =~ tr/[EB]/E/;
88     $dssp =~ tr/[TS ]/ /;
89     }
90    
91     parse_blast(\$beginning_annotation, \$ending_annotation, \%alignments, $args{alignment});
92     if ($alignments{$subjectGiven}) {
93     $alignment = $alignments{$subjectGiven};
94     @HSPs = parse_blast_alignment_HSP($alignment);
95     ($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end) = extract_HSP_information($HSPs[1]);
96    
97     $finalQueryFasta = "-" x ($subject_start - 1) . substr($queryFasta, 0, $query_start-1).$query.substr($queryFasta, $query_end) . "-" x (length($subjectFasta) - $subject_end);
98     $finalSubjectFasta = substr($subjectFasta, 0, $subject_start-1) . "-" x ($query_start - 1) . $subject. "-" x (length($queryFasta) - $query_end) . substr($subjectFasta, $subject_end);
99    
100     ($queryOrig2Align, $queryAlign2Orig) = get_mapping($finalQueryFasta);
101     ($subjectOrig2Align, $subjectAlign2Orig) = get_mapping($finalSubjectFasta);
102    
103     if(defined $psipred) {
104     $psipredPred = get_align_seq($psipredPred, $queryAlign2Orig, '-');
105     $psipredConf = get_align_seq($psipredConf, $queryAlign2Orig, '-');
106     $dssp = get_align_seq($dssp, $subjectAlign2Orig, '-');
107     }
108    
109     if(defined $residues) {
110     @residues = split(/,/, $residues);
111     for my $res (@residues) {
112     $finalSubjectFasta = lowercase($finalSubjectFasta, $subjectOrig2Align, $res);
113     $alignQueryRes = $subjectOrig2Align->[$res-1];
114     $origQueryRes = $queryAlign2Orig->[$alignQueryRes-1];
115     $origQueryResidues .= $origQueryRes.",";
116     $finalQueryFasta = lowercase($finalQueryFasta, $queryOrig2Align, $origQueryRes);
117     if(defined $psipred) {
118     $dssp = lowercase($dssp, $subjectOrig2Align, $res);
119     $psipredPred = lowercase($psipredPred, $queryOrig2Align, $origQueryRes);
120     if (substr($psipredPred, $alignQueryRes-1, 1) eq substr($dssp, $alignQueryRes-1, 1)) {
121     $psipredResidues .= $origQueryRes.",";
122     } else {
123     $psipredResidues .= "-1,";
124     }
125     }
126     }
127     $origQueryResidues =~ s/,$//;
128     $psipredResidues =~ s/,$//;
129     }
130    
131     if (length($finalQueryFasta) != length($finalSubjectFasta)) {
132     die "Calculated sequences differ in length!!!!!!!!!!!\n";
133     } else {
134     print ">".$queryHeader." | $query_start-$query_end | ".($subject_start-1+$query_start)."-".($subject_start-1+$query_start-1+length($query))."\n".$finalQueryFasta."\n";
135     print ">".$args{pdbcode}.$args{chaincode}." | $subject_start-$subject_end | ".($query_start-1+$subject_start)."-".($query_start-1+$subject_start-1+length($subject))."\n".$finalSubjectFasta."\n";
136     if(defined $residues) {
137     print "\n\nOriginal Subject Residues : $residues\n";
138     print "Original Query Residues : $origQueryResidues\n";
139     }
140     if(defined $psipred) {
141     print "Psipred Agreement Residues: $psipredResidues\n";
142     print "\n\n$queryId: $finalQueryFasta\n$queryId: $psipredConf\n$queryId: $psipredPred\n";
143     print "$args{pdbcode}$args{chaincode}: $finalSubjectFasta\n$args{pdbcode}$args{chaincode}: $dssp\n";
144     }
145     }
146     } else {
147     die "No such pdb found in the blast file!!!!!!!!!!!!\n";
148     }
149    
150     exit;
151    
152     sub usage {
153     print "Usage:\n perl parse_blast.pl\n";
154     print "\t -i : file with input target sequence in FASTA format\n";
155     print "\t -p : pdb code of the subject/template sequence\n";
156     print "\t -c : pdb chain code of the subject/template sequence\n";
157     print "\t -a : psi-blact classic output file for input target sequence\n";
158     print "\t [-r] : comma-separated list of residues to be examined. Optional\n";
159     print "\t [-s] : psipred output file. Optional\n";
160     exit;
161     }
162    
163     sub get_mapping {
164     my $seq = shift;
165     my @chars = split(//,$seq);
166     my $origCharPos = 0;
167     my $alignCharPos = 0;
168     my @orig2align = ();
169     my @align2orig = ();
170    
171     for my $char (@chars) {
172     $alignCharPos++;
173     if (!($char eq "-")) {
174     $origCharPos++;
175     push(@orig2align, $alignCharPos);
176     push(@align2orig, $origCharPos);
177     } else {
178     push(@align2orig, -1);
179     }
180     }
181    
182     return (\@orig2align, \@align2orig);
183     }
184    
185     sub lowercase {
186     my ($seq, $orig2align, $origCharPos) = @_;
187     my $alignCharPos = $orig2align->[$origCharPos-1];
188     return (substr($seq, 0, $alignCharPos-1).lcfirst(substr($seq, $alignCharPos-1)));
189     }
190    
191     sub get_align_seq {
192     my ($seq, $align2orig, $gapChar) = @_;
193     my @map = @{$align2orig};
194     my $alignSeq = '';
195    
196     for my $pos (@map) {
197     if ($pos == -1) {
198     $alignSeq .= $gapChar;
199     } else {
200     $alignSeq .= substr($seq, ($pos-1), 1);
201     }
202     }
203    
204     return $alignSeq;
205     }
206    
207     sub parse_psipred {
208    
209     my $filename = shift;
210     my $psipred_output_file = '';
211     my $pred = '';
212     my $conf = '';
213    
214     $psipred_output_file = join( '', get_file_data($filename));
215     $pred = join ( '' , ($psipred_output_file =~ /^Pred: (.*)\n/gm) );
216     $conf = join ( '' , ($psipred_output_file =~ /^Conf: (.*)\n/gm) );
217    
218     return ($pred, $conf);
219     }
220    
221     sub parse_dssp {
222    
223     my $filename = shift;
224     my $dssp_output_file = '';
225     my $dssp = '';
226     my $mainPart = '';
227    
228     $dssp_output_file = join( '', get_file_data($filename));
229     ($mainPart) = ($dssp_output_file =~ /.*CA\s+(^.*)/ms);
230     $dssp = join ( '' , ($mainPart =~ /^.{16}(.{1}).*\n/gm) );
231     $dssp =~ s/\n+$//;
232    
233     return ($dssp);
234     }
235    
236     # get_file_data
237     #
238     # A subroutine to get data from a file given its filename
239    
240     ################################################################################
241     # Subroutine from Chapter 8
242     ################################################################################
243    
244     sub get_file_data {
245    
246     my($filename) = @_;
247    
248     use strict;
249     use warnings;
250    
251     # Initialize variables
252     my @filedata = ( );
253    
254     unless( open(GET_FILE_DATA, $filename) ) {
255     print STDERR "Cannot open file \"$filename\"\n\n";
256     exit;
257     }
258    
259     @filedata = <GET_FILE_DATA>;
260    
261     close GET_FILE_DATA;
262    
263     return @filedata;
264     }
265    
266     ################################################################################
267     # Subroutines for Example 12-1
268     ################################################################################
269    
270     # parse_blast
271     #
272     # -parse beginning and ending annotation, and alignments,
273     # from BLAST output file
274    
275     sub parse_blast {
276    
277     my($beginning_annotation, $ending_annotation, $alignments, $filename) = @_;
278    
279     # $beginning_annotation-reference to scalar
280     # $ending_annotation -reference to scalar
281     # $alignments -reference to hash
282     # $filename -scalar
283    
284     # declare and initialize variables
285     my $blast_output_file = '';
286     my $alignment_section = '';
287    
288     # Get the BLAST program output into an array from a file
289     $blast_output_file = join( '', get_file_data($filename));
290    
291     # Extract the beginning annotation, alignments, and ending annotation
292     ($$beginning_annotation, $alignment_section, $$ending_annotation)
293     = ($blast_output_file =~ /(.*?)(^>.*)(^ Database:.*)/ms);
294    
295     # Populate %alignments hash
296     # key = ID of hit
297     # value = alignment section
298     %$alignments = parse_blast_alignment($alignment_section);
299     }
300    
301     # parse_blast_alignment
302     #
303     # -parse the alignments from a BLAST output file,
304     # return hash with
305     # key = ID
306     # value = text of alignment
307    
308     sub parse_blast_alignment {
309    
310     my($alignment_section) = @_;
311    
312     # declare and initialize variables
313     my(%alignment_hash) = ( );
314    
315     # loop through the scalar containing the BLAST alignments,
316     # extracting the ID and the alignment and storing in a hash
317     #
318     # The regular expression matches a line beginning with >,
319     # and containing the ID between the first pair of | characters;
320     # followed by any number of lines that don't begin with >
321    
322     while($alignment_section =~ /^>.*\n(^(?!>).*\n)+/gm) {
323     my($value) = $&;
324     my($key) = ($value =~ /pdb\|(.*?) /);
325     $alignment_hash{$key} = $value;
326     }
327    
328     return %alignment_hash;
329     }
330    
331     ################################################################################
332     # Subroutines for Example 12-2
333     ################################################################################
334    
335     # parse_blast_alignment_HSP
336     #
337     # -parse beginning annotation, and HSPs,
338     # from BLAST alignment
339     # Return an array with first element set to the beginning annotation,
340     # and each successive element set to an HSP
341    
342     sub parse_blast_alignment_HSP {
343    
344     my($alignment ) = @_;
345    
346     # declare and initialize variables
347     my $beginning_annotation = '';
348     my $HSP_section = '';
349     my @HSPs = ( );
350    
351     # Extract the beginning annotation and HSPs
352     ($beginning_annotation, $HSP_section )
353     = ($alignment =~ /(.*?)(^ Score =.*)/ms);
354    
355     # Store the $beginning_annotation as the first entry in @HSPs
356     push(@HSPs, $beginning_annotation);
357    
358     # Parse the HSPs, store each HSP as an element in @HSPs
359     while($HSP_section =~ /(^ Score =.*\n)(^(?! Score =).*\n)+/gm) {
360     push(@HSPs, $&);
361     }
362    
363     # Return an array with first element = the beginning annotation,
364     # and each successive element = an HSP
365     return(@HSPs);
366     }
367    
368     # extract_HSP_information
369     #
370     # -parse a HSP from a BLAST output alignment section
371     # - return array with elements:
372     # Expect value
373     # Query string
374     # Query range
375     # Subject string
376     # Subject range
377    
378     sub extract_HSP_information {
379    
380     my($HSP) = @_;
381    
382     # declare and initialize variables
383     my($expect) = '';
384     my($query) = '';
385     my($query_start) = '';
386     my($query_end) = '';
387     my($subject) = '';
388     my($subject_start) = '';
389     my($subject_end) = '';
390    
391     ($expect) = ($HSP =~ /Expect = (\S+),/);
392    
393     $query = join ( '' , ($HSP =~ /^Query.*\n/gm) );
394    
395     $subject = join ( '' , ($HSP =~ /^Sbjct.*\n/gm) );
396    
397     ($query_start, $query_end) = ($query =~ /(\d+).*\D(\d+)/s);
398    
399     ($subject_start, $subject_end) = ($subject =~ /(\d+).*\D(\d+)/s);
400    
401     #$query =~ s/[^acgt]//g;
402    
403     #$subject =~ s/[^acgt]//g;
404    
405     $query = join ( '' , ($HSP =~ /^Query:[\s\d]+([^\s]+)[\s\d]+\n/gm) );
406    
407     $subject = join ( '' , ($HSP =~ /^Sbjct:[\s\d]+([^\s]+)[\s\d]+\n/gm) );
408    
409     return ($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end);
410     }