#!
/usr/bin/perl -w
#
--
#
AUTHOR: Amber Fedynak (afedynak@gmail.com)
#
DATE: Feb 22, 2010
#
NOTE: Copyright (c) 2010 under GNU GPL
#
--
# Import Modules
use strict;
use Bio::SeqIO;
# -- Main program
# DESCRIPTION: Reads in mature and hairpin
RNA FASTA files. Retrieve all matching pairs.
# OUTPUT: Fasta
file "output.fa" of all matches and the 5' microRNA variations.
# --
my %matureRNA = ();
my %hairpinRNA = ();
my $matureRNA_ref = read_FASTA_to_hash
("mature.fa", "ath");
my $hairpinRNA_ref = read_FASTA_to_hash
("hairpin.fa", "ath");
%matureRNA = %$matureRNA_ref;
%hairpinRNA = %$hairpinRNA_ref;
get_RNA_variations("output.fa", \%matureRNA, \%hairpinRNA);
# --
# SUB: get_RNA_variations:
Retrieve each matureRNA and matching hairpin RNA,
write sequences and 5' variations to output file
# ARGS: (Hash 1, Hash 2); Hash of Bio::Seq objects
# OUTPUT: FASTA sequence
file with hairpinRNA, microRNA,
and 5' variations
# --
sub get_RNA_variations {
my $outfile = $_[0];
my %temp1 = %{$_[1]};
my %temp2 = %{$_[2]};
# For each mature RNA, find matching hairpin
RNA
for my $key1 ( sort keys %temp1 ) {
my $pid1 = $temp1{$key1}->primary_id;
# Only retrieve sequences with "ath" as header
$pid1 =~ /^ath.+\*?$/;
# When match found, write hairpinRNA,
matureRNA, and 5' sequence variations to output file
for my $key2 ( sort keys %temp2 ) {
my $pid2 =
$temp2{$key2}->primary_id;
if ($pid1 =~
m/^$pid2/i) {
my $seq1 = $temp1{$key1}->seq;
my $seq2 = $temp2{$key2}->seq;
# Retrieve 0-3
nucleotides off 5' end of sequence
$seq2 =~ m/(\w{0,3})$seq1/i;
# Write the hairpin, microRNA, and 5' variations to output file
write_FASTA_to_file($outfile,
$temp2{$key2}->primary_id, $temp2{$key2}->desc, $seq2);
write_FASTA_to_file($outfile,
$pid1, $temp2{$key2}->desc, $seq1);
write_y_variations($1, $seq1, $pid1,
$outfile);
}
}
}
}
# --
# SUB: write_y_variations:
Print out microRNA sequences with 1, 2, and 3
nucleotide extensions off the 5' end
# ARGS: (Array, Scalar, Scalar, Scalar);
Array containing up to 3 nucleotides, sequence, primary id, output file
# OUTPUT: Concatenates up to 3 sequences
with 5' variations to output file
# --
sub write_y_variations {
my @ySeq = split(//, $_[0]);
my $microSeq = $_[1];
my $microPid = $_[2];
my $outfile = $_[3];
my $length =
scalar(@ySeq);
# Print out microRNA
sequences with up to 3 nucleotides off the 5' end
for my $i (1..3) {
if (($i == 1)&&(scalar(@ySeq)
>=1)) {
write_FASTA_to_file($outfile,
"+1 " . $microPid, "", $ySeq[$length-1] . $microSeq);
}
elsif (($i ==
2)&&(scalar(@ySeq) >=2)) {
write_FASTA_to_file($outfile,
"+2 " . $microPid, "", $ySeq[$length-2]
. $ySeq[$length-1]
. $microSeq);
}
elsif (($i ==
3)&&(scalar(@ySeq) >=3)) {
write_FASTA_to_file($outfile,
"+3 " . $microPid, "", $ySeq[$length-3]
. $ySeq[$length-2]
. $ySeq[$length-1]
. $microSeq);
}
}
}
# --
# SUB: write_FASTA_to_file:
Concatenates FASTA sequence to end of output file. Changes RNA to DNA.
#
ARGS: (Scalar, Scalar, Scalar, Scalar); Output filename, primary id,
description, sequence
# OUTPUT: Prints sequence info in FASTA
format.
# --
sub write_FASTA_to_file {
my $file = $_[0];
my $pid = $_[1];
my $desc = $_[2];
my $seq = $_[3];
$seq =~ s/U/T/g;
open (FILE,
">>$file") or die ("Error sub write_FASTA_to_file:
Cannot open file $file for writing..!\n");
print FILE
">", $pid, " ", $desc, " ", "\n";
print FILE $seq, "\n";
close (FILE);
}
# --
# SUB: read_FASTA_to_hash:
Reads in FASTA file as hash of Bio::Seq objects.
Primary id used as hash key.
# Selects
only those sequences with "$id" as header.
# ARGS: (Scalar, Scalar); Input filename,
Header
# RETURN: Reference to hash of Bio::Seq objects
# --
sub read_FASTA_to_hash {
my $file = $_[0];
my $id = $_[1];
my $count =0;
my %data = ();
my $seq_in = Bio::SeqIO->new( -format => 'fasta',
-file
=> $file);
while( my $seq = $seq_in->next_seq() ) {
if ($seq->primary_id
=~ /^$id/) {
$count++;
$data{$seq->primary_id} = $seq;
}
}
return \%data;
}