#! /usr/bin/perl -w

# --

# AUTHOR: Amber Fedynak (afedynak@gmail.com)

# DATE: Feb 22, 2010

# NOTE: Copyright (c) 2010 under GNU GPL

# --

 

 

   # Import Modules

   use strict;

   use Bio::SeqIO;

 

 

    # -- Main program

    # DESCRIPTION: Reads in mature and hairpin RNA FASTA files. Retrieve all matching pairs.

    # OUTPUT: Fasta file "output.fa" of all matches and the 5' microRNA variations.   

    # --

    my %matureRNA = ();

    my %hairpinRNA = ();

 

    my $matureRNA_ref = read_FASTA_to_hash ("mature.fa", "ath");

    my $hairpinRNA_ref = read_FASTA_to_hash ("hairpin.fa", "ath");

 

    %matureRNA = %$matureRNA_ref;

    %hairpinRNA = %$hairpinRNA_ref;

 

    get_RNA_variations("output.fa", \%matureRNA, \%hairpinRNA);

  

 

 

 

    # --

    # SUB: get_RNA_variations: Retrieve each matureRNA and matching hairpin RNA, write sequences and 5' variations to output file

    # ARGS: (Hash 1, Hash 2); Hash of Bio::Seq objects

    # OUTPUT: FASTA sequence file with hairpinRNA, microRNA, and 5' variations

    # --

    sub get_RNA_variations {

          my $outfile = $_[0];

          my %temp1 = %{$_[1]};

          my %temp2 = %{$_[2]};

        

          # For each mature RNA, find matching hairpin RNA

          for my $key1 ( sort keys %temp1 ) {

                my $pid1 = $temp1{$key1}->primary_id;

 

                # Only retrieve sequences with "ath" as header

                $pid1 =~ /^ath.+\*?$/;   

 

                # When match found, write hairpinRNA, matureRNA, and 5' sequence variations to output file

                for my $key2 ( sort keys %temp2 ) {

                      my $pid2 = $temp2{$key2}->primary_id;

                      if ($pid1 =~ m/^$pid2/i) {

                        my $seq1 = $temp1{$key1}->seq;

                        my $seq2 = $temp2{$key2}->seq;

 

                        # Retrieve 0-3 nucleotides off 5' end of sequence

                        $seq2 =~ m/(\w{0,3})$seq1/i;

 

                        # Write the hairpin, microRNA, and 5' variations to output file

                        write_FASTA_to_file($outfile, $temp2{$key2}->primary_id, $temp2{$key2}->desc, $seq2);

                        write_FASTA_to_file($outfile, $pid1, $temp2{$key2}->desc, $seq1);

                        write_y_variations($1, $seq1, $pid1, $outfile);

                  }

                }

          }

    }

 

 

 

 

 

    # --

    # SUB: write_y_variations: Print out microRNA sequences with 1, 2, and 3 nucleotide extensions off the 5' end

    # ARGS: (Array, Scalar, Scalar, Scalar); Array containing up to 3 nucleotides, sequence, primary id, output file

    # OUTPUT: Concatenates up to 3 sequences with 5' variations to output file

    # --

    sub write_y_variations {

      my @ySeq = split(//, $_[0]);

      my $microSeq = $_[1];

      my $microPid = $_[2];

      my $outfile = $_[3];

      my $length = scalar(@ySeq);

 

      # Print out microRNA sequences with up to 3 nucleotides off the 5' end

      for my $i (1..3) {

 

            if (($i == 1)&&(scalar(@ySeq) >=1)) {

                  write_FASTA_to_file($outfile, "+1 " . $microPid, "",  $ySeq[$length-1] . $microSeq);

            }

 

            elsif (($i == 2)&&(scalar(@ySeq) >=2)) {

                  write_FASTA_to_file($outfile, "+2 " . $microPid, "", $ySeq[$length-2] . $ySeq[$length-1] . $microSeq);

            }    

 

            elsif (($i == 3)&&(scalar(@ySeq) >=3)) {

                  write_FASTA_to_file($outfile, "+3 " . $microPid, "", $ySeq[$length-3] . $ySeq[$length-2] . $ySeq[$length-1] . $microSeq);

            }

      }

    }

 

 

 

    # --

    # SUB: write_FASTA_to_file: Concatenates FASTA sequence to end of output file. Changes RNA to DNA.

    # ARGS: (Scalar, Scalar, Scalar, Scalar); Output filename, primary id, description, sequence

    # OUTPUT: Prints sequence info in FASTA format. 

    # --

    sub write_FASTA_to_file {

      my $file = $_[0];

      my $pid = $_[1];

      my $desc = $_[2];

      my $seq = $_[3];

      $seq =~ s/U/T/g;

 

      open (FILE, ">>$file") or die ("Error sub write_FASTA_to_file: Cannot open file $file for writing..!\n");

      print FILE ">", $pid, " ", $desc, " ", "\n";

 

      print FILE $seq, "\n";

      close (FILE);    

    }

 

 

 

    # --

    # SUB: read_FASTA_to_hash: Reads in FASTA file as hash of Bio::Seq objects. Primary id used as hash key.

    #       Selects only those sequences with "$id" as header.

    # ARGS: (Scalar, Scalar); Input filename, Header

    # RETURN: Reference to hash of Bio::Seq objects 

    # --

    sub read_FASTA_to_hash {

      my $file = $_[0];

      my $id = $_[1];

      my $count =0;

      my %data = ();

      my $seq_in  = Bio::SeqIO->new( -format => 'fasta',

                                       -file => $file);

 

 

      while( my $seq = $seq_in->next_seq() ) {

          if ($seq->primary_id =~ /^$id/) {

            $count++;

            $data{$seq->primary_id} = $seq;

         }

      }

      return \%data;

    }