#! /usr/bin/perl -w

# --

# AUTHOR: Amber Fedynak (afedynak@gmail.com)

# DATE: Feb 22, 2010

# --

# Import Modules

use strict;

use Bio::SeqIO;

# -- Main program

# DESCRIPTION: Reads in mature and hairpin RNA FASTA files. Retrieve all matching pairs.

# OUTPUT: Fasta file "output.fa" of all matches and the 5' microRNA variations.

# --

my %matureRNA = ();

my %hairpinRNA = ();

my $matureRNA_ref = read_FASTA_to_hash ("mature.fa", "ath");

my $hairpinRNA_ref = read_FASTA_to_hash ("hairpin.fa", "ath");

%matureRNA = %$matureRNA_ref;

%hairpinRNA = %$hairpinRNA_ref;

get_RNA_variations("output.fa", \%matureRNA, \%hairpinRNA);

# --

# SUB: get_RNA_variations: Retrieve each matureRNA and matching hairpin RNA, write sequences and 5' variations to output file

# ARGS: (Hash 1, Hash 2); Hash of Bio::Seq objects

# OUTPUT: FASTA sequence file with hairpinRNA, microRNA, and 5' variations

# --

sub get_RNA_variations {

my $outfile = $_[0];

my %temp1 = %{$_[1]};

my %temp2 = %{$_[2]};

# For each mature RNA, find matching hairpin RNA

for my $key1 ( sort keys %temp1 ) {

my $pid1 = $temp1{$key1}->primary_id;

# Only retrieve sequences with "ath" as header

$pid1 =~ /^ath.+\*?$/;

# When match found, write hairpinRNA, matureRNA, and 5' sequence variations to output file

for my $key2 ( sort keys %temp2 ) {

my $pid2 = $temp2{$key2}->primary_id;

if ($pid1 =~ m/^$pid2/i) {

my $seq1 = $temp1{$key1}->seq;

my $seq2 = $temp2{$key2}->seq;

# Retrieve 0-3 nucleotides off 5' end of sequence

$seq2 =~ m/(\w{0,3})$seq1/i;

# Write the hairpin, microRNA, and 5' variations to output file

write_FASTA_to_file($outfile, $temp2{$key2}->primary_id, $temp2{$key2}->desc, $seq2);

write_FASTA_to_file($outfile, $pid1, $temp2{$key2}->desc, $seq1);

write_y_variations($1, $seq1, $pid1, $outfile);

}

# --

# SUB: write_y_variations: Print out microRNA sequences with 1, 2, and 3 nucleotide extensions off the 5' end

# ARGS: (Array, Scalar, Scalar, Scalar); Array containing up to 3 nucleotides, sequence, primary id, output file

# OUTPUT: Concatenates up to 3 sequences with 5' variations to output file

# --

sub write_y_variations {

my @ySeq = split(//, $_[0]);

my $microSeq = $_[1];

my $microPid = $_[2];

my $outfile = $_[3];

my $length = scalar(@ySeq);

# Print out microRNA sequences with up to 3 nucleotides off the 5' end

for my $i (1..3) {

if (($i == 1)&&(scalar(@ySeq) >=1)) {

write_FASTA_to_file($outfile, "+1 " . $microPid, "", $ySeq[$length-1] . $microSeq);

}

elsif (($i == 2)&&(scalar(@ySeq) >=2)) {

write_FASTA_to_file($outfile, "+2 " . $microPid, "", $ySeq[$length-2] . $ySeq[$length-1] . $microSeq);

}

elsif (($i == 3)&&(scalar(@ySeq) >=3)) {

write_FASTA_to_file($outfile, "+3 " . $microPid, "", $ySeq[$length-3] . $ySeq[$length-2] . $ySeq[$length-1] . $microSeq);

}

# --

# SUB: write_FASTA_to_file: Concatenates FASTA sequence to end of output file. Changes RNA to DNA.

# ARGS: (Scalar, Scalar, Scalar, Scalar); Output filename, primary id, description, sequence

# OUTPUT: Prints sequence info in FASTA format.

# --

sub write_FASTA_to_file {

my $file = $_[0];

my $pid = $_[1];

my $desc = $_[2];

my $seq = $_[3];

$seq =~ s/U/T/g;

open (FILE, ">>$file") or die ("Error sub write_FASTA_to_file: Cannot open file $file for writing..!\n");

print FILE ">", $pid, " ", $desc, " ", "\n";

print FILE $seq, "\n";

close (FILE);

}

# --

# SUB: read_FASTA_to_hash: Reads in FASTA file as hash of Bio::Seq objects. Primary id used as hash key.

# Selects only those sequences with "$id" as header.

# ARGS: (Scalar, Scalar); Input filename, Header

# RETURN: Reference to hash of Bio::Seq objects

# --

sub read_FASTA_to_hash {

my $file = $_[0];

my $id = $_[1];

my $count =0;

my %data = ();

my $seq_in = Bio::SeqIO->new( -format => 'fasta',

-file => $file);

while( my $seq = $seq_in->next_seq() ) {

if ($seq->primary_id =~ /^$id/) {

$count++;

$data{$seq->primary_id} = $seq;

}

return \%data;

}