[BiO BB] PCR primer checking

Fri Nov 22 14:30:45 EST 2002

I think e-pcr would be the way to go.

With a file that looks like

Pair1      TTTTATGACCTTTTAAATCACACAAC      GGTGTTGAGTACGCGAATGA    353 

Running the attached script should give you an output file (e-pcr.out, along with lots of other files)

Pari1 9.1-132438756 81510652 81511005 N3
Pair1 9.1-132438756 86487251 86487603 N0

Which says the primer pairs amplified 2 different segments of the genome, one perfectly and the second with a few mismatches in one of the primers. The fourth column in the input file is how big a product you expect, and there is a value in the e-pcr command line which sets how much leeway you want to give on that. 

With a bit of work you can hack the script to get the position of the mismatch (ie. if these are at the 3' end). I had that in but due to a disk and backup tape loss I can't give that to you today, but its on my list of things to add back in. 

The script is a hack I know, so if you have any questions please ask. You'll need to get the fasta files for the chromosomes from your favourite place (eg. Ensembl). That and have bioperl installed. 

In the very least it should give you a hint on how to get started.

#!/bin/perl

use FileHandle;
use Bio::Seq::LargePrimarySeq;
 use Bio::SeqIO;

$filein=$ARGV[0];
$fileout=$ARGV[1];

my $directory_of_chromsome_files_to_screen="/data/ensembl/pub/human-7.30/data/golden_path/chromosomes/";

for ($mismatch=0;$mismatch < 3;$mismatch++) {
  #for ($mismatch=0;$mismatch < 3;$mismatch++) {

  my $fileout=$fileout."_N".$mismatch;

  print STDERR "Going to epcr\n";

  my $epcr_output_file=run_e_pcr($filein,$directory_of_chromsome_files_to_screen,$fileout,$mismatch);

  # print STDERR "Going to parse $epcr_output_file\n";

  #  parse_e_pcr($epcr_output_file,$mismatch);
}

print "Preparing final summary\n";
my $summary_file=$fileout."_all_hits";
epcr_summary($summary_file,$fileout);

sub run_e_pcr{
  print STDERR "Running e-PCR, this will take awhile....\n";
  my ($filein,$dir_of_chromosomes,$fileout,$mismatch)=@_;
  # my $fileout_ePCR=$fileout."_"."ePCR";
  opendir(CHROMOSOMES,$dir_of_chromosomes);
  my @chromosomes=readdir(CHROMOSOMES);
  foreach my $chromosome (@chromosomes) {
    my $fileout_ePCR=$fileout."_".$chromosome."_ePCR";
    if ($chromosome =~ /fa/) {
      next unless  ($chromosome =~ /chr9.fa/);
      #      if ($chromosome =~ /chr1.fa/) {
      my $path_to_file="$dir_of_chromosomes"."/".$chromosome;
      #print STDERR "Running on $path_to_file\n";
      print STDERR "Running e-PCR $filein $path_to_file N=$mismatch and output to $fileout_ePCR\n";

      exec `e-PCR $filein $path_to_file N=$mismatch M=50 >  $fileout_ePCR`;
      #      open (IN,"e-PCR $filein $path_to_file N=$mismatch |");
      #      while (<IN>) {
      #	print_to_file("$fileout_ePCR",$_);
      #      }
      #      close (IN);

      print "Parsing output\n";
      parse_e_pcr($fileout_ePCR,$mismatch);

      #      }
    }

  }
  #  return ($fileout_ePCR);

}

sub parse_e_pcr{
  use Bio::Tools::EPCR;
  my ($file_to_parse,$mismatch)=@_;
  @filename_parts=split('_',$file_to_parse);
  my $all_hits=$filename_parts[0]."_all_hits";
  print "all results file is $all_hits\n";
  my %amplifications;
  my %hits_per_primer;
  print "Trying to open>$file_to_parse<with N>$mismatch<mismatch\n";
  my $parser = new Bio::Tools::EPCR(-file => "$file_to_parse");
  while ( my $feat = $parser->next_feature ) {
    my @values = $feat->each_tag_value('name');
    my $name=$values[0];
    my $note=$values[1];
    #my $name=$feat->each_tag_value('name');
    #my $note=$feat->each_tag_value('note');
    #print "name:$name\n";
    #print "note:$note\n";
    $amplifications{$name}{$feat->seqname}{$feat->start}=$feat->end;
    $hits_per_primer{$name}++;
  }
  foreach my $primer (keys %hits_per_primer) {
    if ($hits_per_primer{$primer} > 1) {
      print "We had $hits_per_primer{$primer} hits for $primer\n";
      my $fileout_multiple_hits=$file_to_parse."__multiple_hits";
      #     print_to_file($fileout_multiple_hits,$primer);
      #   foreach my $primer (sort numeric keys  %amplifications) {
      foreach my $chr (sort numeric keys %{$amplifications{$primer}}) {
	foreach my $start (sort numeric keys %{$amplifications{$primer}{$chr}}) {
	  print_to_file( $fileout_multiple_hits,"$primer $chr $start $amplifications{$primer}{$chr}{$start}");
	}
      }
      #  }
    }
  }
  foreach my $primer (sort numeric keys  %amplifications) {
    foreach my $chr (keys %{$amplifications{$primer}}) {
      foreach my $start (keys %{$amplifications{$primer}{$chr}}) {
	my $all_mismatch=$file_to_parse."_".$mismatch;
	print_to_file( "$all_mismatch","$primer $chr $start $amplifications{$primer}{$chr}{$start}");
	print_to_file( "$all_hits","$primer $chr $start $amplifications{$primer}{$chr}{$start} $mismatch");
      }
    }
  }
}

sub numeric {
  $a <=> $b;
}

sub print_to_file{
  my ($filehandle,$string)=@_;
  my $fh= new FileHandle(">>$filehandle") || die;
  $fh->print($string,"\n");
  ;
};

sub epcr_summary{
  my ($summary_file,$final_output)=@_;

  print "Going to summarize $summary_file into $final_output\n";
  my %summary;
  my %count;
  open (IN,"$summary_file");
  while (<IN>) {
    chomp;
    my ($primer, $chr, $start,$end, $mismatch)=split;
    print "Found  $primer, $chr, $start,$end, $mismatch\n";
    if ($summary{$primer}{$chr}{$start}{$end}) {
      print "Skipping\n";
    } else {
      print "this >$summary{$primer}{$chr}{$start}{$end}< is empty Adding\n";
      $count{$primer}++;
      $summary{$primer}{$chr}{$start}{$end}="N".$mismatch ;
      print "Just added $primer, $chr, $start,$end, $mismatch to tracking hash\n" ;
      print "Count for primer is now:$count{$primer}<\n";
    }

  }
  foreach my $primer (keys %count) {
    print "Count for $primer is:$count{$primer}<\n";
    if ($count{$primer} > 1) {
      print "Count is over 1 for $primer\n";
      # foreach my $primer (sort numeric keys  %summary) {
      foreach my $chr (sort numeric keys %{$summary{$primer}}) {
	print "chr:$chr\n";
	my @range=split('\.',$chr);
	#print "0:$range[0],1,$range[1],2,$range[2]\n";
	my $path_to_file="$directory_of_chromsome_files_to_screen"."/chr".$range[0].".fa";
	print "file:$path_to_file\n";
	my $in  = Bio::SeqIO->new('-file' => "$path_to_file",
				  '-format' => 'largefasta');
	my $seqobj = $in->next_seq();
	  foreach my $start (sort numeric keys %{$summary{$primer}{$chr}}) {
	    foreach my $end (sort numeric keys %{$summary{$primer}{$chr}{$start}}) {
	      my $subseq = $seqobj->subseq($start,$end);
	      open (IN,"e-PCR $filein $path_to_file N=$mismatch |");
	      print_to_file( $final_output,"$primer $chr $start $end $summary{$primer}{$chr}{$start}{$end} $subseq");
	    }
	  }
      }
    }
  }
}

>Hi all,

>I'm hoping a method exists that would allow me to align PCR primers against
>the human genome to determine all products that could be amplified if my
>sample contained extracted human DNA.  Theoretically I could BLAST each
>primer with stringent parameters and record the coordinates on the genome
>looking for matches with a Kb or two of one another.  To do that manually
>would be quite tedious, especially when you consider that I'm only really
>interested in matches at the 3' end.

>Anyone know of a way to do this?

>- Bill

*************************************************************************
This email message is confidential and may contain privileged information. Any unauthorized dissemination or copying is strictly prohibited. If you have received this message in error, please delete it and notify us immediately.

< XGI >