ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/modelit/scripts/parse_blast.pl
Revision: 1299
Committed: Tue Jan 11 17:38:33 2011 UTC (8 years, 8 months ago) by hstehr
File size: 12514 byte(s)
Log Message:
adding Model-It server files to repository
Line File contents
1 #!/usr/bin/perl
2 # Example taken from Chapters 8 and 12 of "Beginning perl for bioinformatics"
3 # Modified by Ioannis Filippis
4
5 # Examples 12-1 and 12-2 Parse alignments from BLAST output file
6
7 use strict;
8 use warnings;
9 #use BeginPerlBioinfo; # see Chapter 6 about this module
10 use Getopt::Long;
11
12 # declare and initialize variables
13 my $beginning_annotation = '';
14 my $ending_annotation = '';
15 my %alignments = ( );
16 my $alignment = '';
17 my @HSPs = ( );
18 my($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end) = ('','','','','','','');
19 my %args = ();
20 my $residues;
21 my @residues;
22 my $origQueryResidues = '';
23 my $psipredResidues = '';
24 my $origQueryRes;
25 my $alignQueryRes;
26 my $psipred;
27 my $help;
28 my $queryFile = '';
29 my $queryFasta = '';
30 my $queryHeader = '';
31 my $queryId = '';
32 my $subjectGiven = '';
33 my $subjectFasta = '';
34 my $finalQueryFasta = '';
35 my $finalSubjectFasta = '';
36 my $queryOrig2Align;
37 my $queryAlign2Orig;
38 my $subjectOrig2Align;
39 my $subjectAlign2Orig;
40 my $psipredPred;
41 my $psipredConf;
42 my $dssp;
43
44 # prints usage
45 # - if not all necessary arguments are passed or
46 # - help option is passed
47 GetOptions("help|?" =>\$help,
48 "input=s"=>\$args{input},
49 "pdbcode=s"=>\$args{pdbcode},
50 "chaincode=s"=>\$args{chaincode},
51 "alignment=s"=>\$args{alignment},
52 "residues:s"=>\$residues,
53 "secstruc:s"=>\$psipred);
54 if (defined $help) {
55 usage();
56 die;
57 }
58 foreach (keys %args) {
59 if (!(defined $args{$_} && exists $args{$_})) {
60 usage();
61 die;
62 }
63 }
64
65 $queryFile = join( '', get_file_data($args{input}));
66 ($queryHeader, $queryFasta) = ($queryFile =~ />(.*?)\n(.*)/ms);
67 $queryFasta =~ s/\n//g;
68 ($queryId) = ($queryHeader =~ /^([^ \t]+)/);
69
70 $subjectGiven = $args{pdbcode}."|".$args{chaincode};
71 $subjectGiven =~ tr/a-z/A-Z/;
72
73 $subjectFasta = `dumpseq -p $args{pdbcode}$args{chaincode} -s -N`;
74 $subjectFasta =~ s/\n//;
75
76 if (defined $psipred) {
77 ($psipredPred, $psipredConf) = parse_psipred($psipred);
78 $psipredPred =~ tr/C/ /;
79 $psipredConf =~ tr/C/ /;
80
81 #This works only in shell : DO NOT DELETE
82 #$dssp= `dumppdb -p $args{pdbcode}$args{chaincode} -s | dssp -- 2>/dev/null | perl -pe 's/^.*\.$//' | sed -e '/^$/d' | sed -e '1d' | cut -c 17 | perl -pe 's/\n//'`;
83 `dumppdb -p $args{pdbcode}$args{chaincode} -s | dssp -- 2>/dev/null 1>$args{pdbcode}$args{chaincode}.dssp`;
84 $dssp = parse_dssp("$args{pdbcode}$args{chaincode}.dssp");
85 unlink("$args{pdbcode}$args{chaincode}.dssp");
86 $dssp =~ tr/[HGI]/H/;
87 $dssp =~ tr/[EB]/E/;
88 $dssp =~ tr/[TS ]/ /;
89 }
90
91 parse_blast(\$beginning_annotation, \$ending_annotation, \%alignments, $args{alignment});
92 if ($alignments{$subjectGiven}) {
93 $alignment = $alignments{$subjectGiven};
94 @HSPs = parse_blast_alignment_HSP($alignment);
95 ($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end) = extract_HSP_information($HSPs[1]);
96
97 $finalQueryFasta = "-" x ($subject_start - 1) . substr($queryFasta, 0, $query_start-1).$query.substr($queryFasta, $query_end) . "-" x (length($subjectFasta) - $subject_end);
98 $finalSubjectFasta = substr($subjectFasta, 0, $subject_start-1) . "-" x ($query_start - 1) . $subject. "-" x (length($queryFasta) - $query_end) . substr($subjectFasta, $subject_end);
99
100 ($queryOrig2Align, $queryAlign2Orig) = get_mapping($finalQueryFasta);
101 ($subjectOrig2Align, $subjectAlign2Orig) = get_mapping($finalSubjectFasta);
102
103 if(defined $psipred) {
104 $psipredPred = get_align_seq($psipredPred, $queryAlign2Orig, '-');
105 $psipredConf = get_align_seq($psipredConf, $queryAlign2Orig, '-');
106 $dssp = get_align_seq($dssp, $subjectAlign2Orig, '-');
107 }
108
109 if(defined $residues) {
110 @residues = split(/,/, $residues);
111 for my $res (@residues) {
112 $finalSubjectFasta = lowercase($finalSubjectFasta, $subjectOrig2Align, $res);
113 $alignQueryRes = $subjectOrig2Align->[$res-1];
114 $origQueryRes = $queryAlign2Orig->[$alignQueryRes-1];
115 $origQueryResidues .= $origQueryRes.",";
116 $finalQueryFasta = lowercase($finalQueryFasta, $queryOrig2Align, $origQueryRes);
117 if(defined $psipred) {
118 $dssp = lowercase($dssp, $subjectOrig2Align, $res);
119 $psipredPred = lowercase($psipredPred, $queryOrig2Align, $origQueryRes);
120 if (substr($psipredPred, $alignQueryRes-1, 1) eq substr($dssp, $alignQueryRes-1, 1)) {
121 $psipredResidues .= $origQueryRes.",";
122 } else {
123 $psipredResidues .= "-1,";
124 }
125 }
126 }
127 $origQueryResidues =~ s/,$//;
128 $psipredResidues =~ s/,$//;
129 }
130
131 if (length($finalQueryFasta) != length($finalSubjectFasta)) {
132 die "Calculated sequences differ in length!!!!!!!!!!!\n";
133 } else {
134 print ">".$queryHeader." | $query_start-$query_end | ".($subject_start-1+$query_start)."-".($subject_start-1+$query_start-1+length($query))."\n".$finalQueryFasta."\n";
135 print ">".$args{pdbcode}.$args{chaincode}." | $subject_start-$subject_end | ".($query_start-1+$subject_start)."-".($query_start-1+$subject_start-1+length($subject))."\n".$finalSubjectFasta."\n";
136 if(defined $residues) {
137 print "\n\nOriginal Subject Residues : $residues\n";
138 print "Original Query Residues : $origQueryResidues\n";
139 }
140 if(defined $psipred) {
141 print "Psipred Agreement Residues: $psipredResidues\n";
142 print "\n\n$queryId: $finalQueryFasta\n$queryId: $psipredConf\n$queryId: $psipredPred\n";
143 print "$args{pdbcode}$args{chaincode}: $finalSubjectFasta\n$args{pdbcode}$args{chaincode}: $dssp\n";
144 }
145 }
146 } else {
147 die "No such pdb found in the blast file!!!!!!!!!!!!\n";
148 }
149
150 exit;
151
152 sub usage {
153 print "Usage:\n perl parse_blast.pl\n";
154 print "\t -i : file with input target sequence in FASTA format\n";
155 print "\t -p : pdb code of the subject/template sequence\n";
156 print "\t -c : pdb chain code of the subject/template sequence\n";
157 print "\t -a : psi-blact classic output file for input target sequence\n";
158 print "\t [-r] : comma-separated list of residues to be examined. Optional\n";
159 print "\t [-s] : psipred output file. Optional\n";
160 exit;
161 }
162
163 sub get_mapping {
164 my $seq = shift;
165 my @chars = split(//,$seq);
166 my $origCharPos = 0;
167 my $alignCharPos = 0;
168 my @orig2align = ();
169 my @align2orig = ();
170
171 for my $char (@chars) {
172 $alignCharPos++;
173 if (!($char eq "-")) {
174 $origCharPos++;
175 push(@orig2align, $alignCharPos);
176 push(@align2orig, $origCharPos);
177 } else {
178 push(@align2orig, -1);
179 }
180 }
181
182 return (\@orig2align, \@align2orig);
183 }
184
185 sub lowercase {
186 my ($seq, $orig2align, $origCharPos) = @_;
187 my $alignCharPos = $orig2align->[$origCharPos-1];
188 return (substr($seq, 0, $alignCharPos-1).lcfirst(substr($seq, $alignCharPos-1)));
189 }
190
191 sub get_align_seq {
192 my ($seq, $align2orig, $gapChar) = @_;
193 my @map = @{$align2orig};
194 my $alignSeq = '';
195
196 for my $pos (@map) {
197 if ($pos == -1) {
198 $alignSeq .= $gapChar;
199 } else {
200 $alignSeq .= substr($seq, ($pos-1), 1);
201 }
202 }
203
204 return $alignSeq;
205 }
206
207 sub parse_psipred {
208
209 my $filename = shift;
210 my $psipred_output_file = '';
211 my $pred = '';
212 my $conf = '';
213
214 $psipred_output_file = join( '', get_file_data($filename));
215 $pred = join ( '' , ($psipred_output_file =~ /^Pred: (.*)\n/gm) );
216 $conf = join ( '' , ($psipred_output_file =~ /^Conf: (.*)\n/gm) );
217
218 return ($pred, $conf);
219 }
220
221 sub parse_dssp {
222
223 my $filename = shift;
224 my $dssp_output_file = '';
225 my $dssp = '';
226 my $mainPart = '';
227
228 $dssp_output_file = join( '', get_file_data($filename));
229 ($mainPart) = ($dssp_output_file =~ /.*CA\s+(^.*)/ms);
230 $dssp = join ( '' , ($mainPart =~ /^.{16}(.{1}).*\n/gm) );
231 $dssp =~ s/\n+$//;
232
233 return ($dssp);
234 }
235
236 # get_file_data
237 #
238 # A subroutine to get data from a file given its filename
239
240 ################################################################################
241 # Subroutine from Chapter 8
242 ################################################################################
243
244 sub get_file_data {
245
246 my($filename) = @_;
247
248 use strict;
249 use warnings;
250
251 # Initialize variables
252 my @filedata = ( );
253
254 unless( open(GET_FILE_DATA, $filename) ) {
255 print STDERR "Cannot open file \"$filename\"\n\n";
256 exit;
257 }
258
259 @filedata = <GET_FILE_DATA>;
260
261 close GET_FILE_DATA;
262
263 return @filedata;
264 }
265
266 ################################################################################
267 # Subroutines for Example 12-1
268 ################################################################################
269
270 # parse_blast
271 #
272 # -parse beginning and ending annotation, and alignments,
273 # from BLAST output file
274
275 sub parse_blast {
276
277 my($beginning_annotation, $ending_annotation, $alignments, $filename) = @_;
278
279 # $beginning_annotation-reference to scalar
280 # $ending_annotation -reference to scalar
281 # $alignments -reference to hash
282 # $filename -scalar
283
284 # declare and initialize variables
285 my $blast_output_file = '';
286 my $alignment_section = '';
287
288 # Get the BLAST program output into an array from a file
289 $blast_output_file = join( '', get_file_data($filename));
290
291 # Extract the beginning annotation, alignments, and ending annotation
292 ($$beginning_annotation, $alignment_section, $$ending_annotation)
293 = ($blast_output_file =~ /(.*?)(^>.*)(^ Database:.*)/ms);
294
295 # Populate %alignments hash
296 # key = ID of hit
297 # value = alignment section
298 %$alignments = parse_blast_alignment($alignment_section);
299 }
300
301 # parse_blast_alignment
302 #
303 # -parse the alignments from a BLAST output file,
304 # return hash with
305 # key = ID
306 # value = text of alignment
307
308 sub parse_blast_alignment {
309
310 my($alignment_section) = @_;
311
312 # declare and initialize variables
313 my(%alignment_hash) = ( );
314
315 # loop through the scalar containing the BLAST alignments,
316 # extracting the ID and the alignment and storing in a hash
317 #
318 # The regular expression matches a line beginning with >,
319 # and containing the ID between the first pair of | characters;
320 # followed by any number of lines that don't begin with >
321
322 while($alignment_section =~ /^>.*\n(^(?!>).*\n)+/gm) {
323 my($value) = $&;
324 my($key) = ($value =~ /pdb\|(.*?) /);
325 $alignment_hash{$key} = $value;
326 }
327
328 return %alignment_hash;
329 }
330
331 ################################################################################
332 # Subroutines for Example 12-2
333 ################################################################################
334
335 # parse_blast_alignment_HSP
336 #
337 # -parse beginning annotation, and HSPs,
338 # from BLAST alignment
339 # Return an array with first element set to the beginning annotation,
340 # and each successive element set to an HSP
341
342 sub parse_blast_alignment_HSP {
343
344 my($alignment ) = @_;
345
346 # declare and initialize variables
347 my $beginning_annotation = '';
348 my $HSP_section = '';
349 my @HSPs = ( );
350
351 # Extract the beginning annotation and HSPs
352 ($beginning_annotation, $HSP_section )
353 = ($alignment =~ /(.*?)(^ Score =.*)/ms);
354
355 # Store the $beginning_annotation as the first entry in @HSPs
356 push(@HSPs, $beginning_annotation);
357
358 # Parse the HSPs, store each HSP as an element in @HSPs
359 while($HSP_section =~ /(^ Score =.*\n)(^(?! Score =).*\n)+/gm) {
360 push(@HSPs, $&);
361 }
362
363 # Return an array with first element = the beginning annotation,
364 # and each successive element = an HSP
365 return(@HSPs);
366 }
367
368 # extract_HSP_information
369 #
370 # -parse a HSP from a BLAST output alignment section
371 # - return array with elements:
372 # Expect value
373 # Query string
374 # Query range
375 # Subject string
376 # Subject range
377
378 sub extract_HSP_information {
379
380 my($HSP) = @_;
381
382 # declare and initialize variables
383 my($expect) = '';
384 my($query) = '';
385 my($query_start) = '';
386 my($query_end) = '';
387 my($subject) = '';
388 my($subject_start) = '';
389 my($subject_end) = '';
390
391 ($expect) = ($HSP =~ /Expect = (\S+),/);
392
393 $query = join ( '' , ($HSP =~ /^Query.*\n/gm) );
394
395 $subject = join ( '' , ($HSP =~ /^Sbjct.*\n/gm) );
396
397 ($query_start, $query_end) = ($query =~ /(\d+).*\D(\d+)/s);
398
399 ($subject_start, $subject_end) = ($subject =~ /(\d+).*\D(\d+)/s);
400
401 #$query =~ s/[^acgt]//g;
402
403 #$subject =~ s/[^acgt]//g;
404
405 $query = join ( '' , ($HSP =~ /^Query:[\s\d]+([^\s]+)[\s\d]+\n/gm) );
406
407 $subject = join ( '' , ($HSP =~ /^Sbjct:[\s\d]+([^\s]+)[\s\d]+\n/gm) );
408
409 return ($expect, $query, $query_start, $query_end, $subject, $subject_start, $subject_end);
410 }