#!/usr/bin/perl # Cared for by Milo Thurston (mith@ceh.ac.uk) ###################################################### # A reasonably simple application to search for all # # mono-hexanucleotide repeats in Genbank or # # fasta formatted files, and provide detailed output # ###################################################### ####################################################################################### # You should have received a copy of the GNU General Public License # # in the file COPYING along with this script; if not, write to the Free Software # # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ####################################################################################### use strict; use CGI; use Cwd; use Bio::Location::Split; use Bio::SeqIO; use Config::Simple; use File::Copy; use Getopt::Std; use List::Util qw(min max); use Term::ReadLine; my $version = "1.6.7"; ##################### # usage information # ##################### my $usage=" Usage: msatfinder [options] files msatfinder -b *.gbk msatfinder -b -l examples.list Options: -b backup your old data directories (adds date & time suffix) -d delete most recent data directories, don't search for microsatellites -e <1-3> engine to use - see the manual (default 1) -f set flank size, overriding config file (default 300) -h list these options -l read a list of genomes from a text file -s silence most output to screen (overrides config file) -x delete all data directories, don't search for microsatellites By default, msatfinder will take the list of files given on the command line as the files to read. Instead you may use the -l option, and tell it to read a list of the genomes you would like to be processed. Use options \"-d\" and \"-x\" with caution. "; ################ # idendify cwd # ################ my $cwd = getcwd; ####################### # date related things # ####################### my $date = `date`; chomp($date); my %dtrans = ( "JAN" => "01", "FEB" => "02", "MAR" => "03", "APR" => "04", "MAY" => "05", "JUN" => "06", "JUL" => "07", "AUG" => "08", "SEP" => "09", "OCT" => "10", "NOV" => "11", "DEC" => "12" ); my $datestring = `date +%j%H%M%S`; chomp($datestring); #################################### # import settings from config file # #################################### my $config = &getconfig($cwd,@ARGV); ############################ # various global variables # ############################ my $override = $config->{'FINDER.override'}; my $artemis = $config->{'FINDER.artemis'}; my $mine = $config->{'FINDER.mine'}; my $fastafile = $config->{'FINDER.fastafile'}; my $sumswitch = $config->{'FINDER.sumswitch'}; my $screendump = $config->{'FINDER.screendump'}; my $run_eprimer = $config->{'DEPENDENCIES.run_eprimer'}; my $eprimer_args = $config->{'DEPENDENCIES.eprimer_args'}; my $debug = $config->{'COMMON.debug'}; my $flank_size = $config->{'COMMON.flank_size'}; my $remote_link = $config->{'FINDER.remote_link'}; my $primer3core = $config->{'DEPENDENCIES.primer3core'}; my $eprimer = $config->{'DEPENDENCIES.eprimer'}; my $dbname = $config->{'VIEWER.dbname'}; my $dbtype = $config->{'VIEWER.dbtype'}; my $dbusername = $config->{'VIEWER.gdbusername'}; my $dbusername2 = $config->{'VIEWER.sdbusername'}; my $cgiloc = $config->{'VIEWER.cgiloc'}; my $msatview = $config->{'VIEWER.msatview'}; my $run_mview = $config->{'ALIGNER.run_mview'}; ######################################## # subdirectories created by msatfinder # ######################################## my $mine_dir = $config->{'COMMON.mine_dir'}; my $repeat_dir = $config->{'COMMON.repeat_dir'}; my $tab_dir = $config->{'COMMON.tab_dir'}; my $bigtab_dir = $config->{'COMMON.bigtab_dir'}; my $fasta_dir = $config->{'COMMON.fasta_dir'}; my $prime_dir = $config->{'COMMON.prime_dir'}; my $count_dir = $config->{'COMMON.count_dir'}; my @newdirs = ($mine_dir,$repeat_dir,$tab_dir,$bigtab_dir,$fasta_dir,$prime_dir,$count_dir); ############################################## # determine which options have been selected # ############################################## my %opts=(); getopts('abcdxhlt:e:f:m:s',\%opts); # print help message if ($opts{h}) { print $usage; exit; } if ($opts{s}) { $screendump = 0; } # delete files if ($opts{d}) { print "Deleting existing data.\n"; foreach my $dir (@newdirs) { $dir =~ s/\///; system("rm -rf $dir") == 0 or warn "Having trouble deleting directory $dir, $!"; } print "Exiting.\n"; exit; } # delete ALL files if ($opts{x}) { print "Deleting ALL old data.\n"; foreach my $dir (@newdirs) { $dir =~ s/\///; system("rm -rf $dir*") == 0 or warn "Having trouble deleting directory $dir, $!"; } print "Exiting.\n"; exit; } # backup files if ($opts{b}) { print "Backing up date files with suffix $datestring\n"; foreach my $move (@newdirs) { $move =~ s/\///; system("cp -r $cwd/$move $cwd/$move.$datestring") == 0 or warn "Can't copy $move, $!"; } } # which engine to use my $engine; if ($opts{e}) { $engine = $opts{e}; } else { $engine = 1; } if ($engine == 1) { print "Using regexp engine.\n" if ($screendump == 1); } elsif ($engine == 2) { print "Using multipass engine.\n" if ($screendump == 1); } elsif ($engine == 3) { print "Using iterative engine.\n" if ($screendump == 1); } else { die "Somehow, no engine has been defined!\n"; } print "Current directory: $cwd\n" if ($screendump == 1); # what are the flanks? if ($opts{f}) { if ($opts{f} eq "") { $flank_size = 300; } else { $flank_size = $opts{f}; } print "Overriding flank size in config file - using value: $flank_size\n" if ($screendump == 1); } ##################################### # set up motif range and thresholds # ##################################### my $threshold = $config->{'FINDER.motif_threshold'}; $threshold =~ s/^\|//; $threshold =~ s/\|$//; my @thresholds; my @motif_range; # load motif ranges into array my @motif_range_pre = split(/\|/, $threshold); # remove any duplicates my %seen = (); my @threshes=(); foreach my $blah (@motif_range_pre) { my ($mot,$thresh) = split(/,/,$blah); $thresholds[$mot-1] = $thresh; push(@threshes,$thresh); $seen{$mot}++; } my @uniq = keys %seen; @motif_range = sort { $a <=> $b } @uniq; print "Using motif types: ", join(",",@motif_range), "\n" if ($screendump == 1); die "You must specify motif types to search for.\nExiting!\n" unless (@motif_range); # check that there are thresholds appropriate to # each motif type... my $unspec = 0; my $typeerror = 0; foreach my $type (@motif_range) { $unspec++ unless ($thresholds[$type-1] =~ /^\d+$/ and $thresholds[$type-1] > 0); $typeerror++ unless ($type =~ /^\d+$/); } # dodgy thresholds if ($typeerror >= 1) { if ($typeerror == 1) { print "Error - you have entered a motif type incorrectly: \n"; } else { print "Error - some motif types entered incorrectly:\n"; } foreach my $type (@motif_range) { print "* $type\n" unless ($type =~ /^\d+$/); } print "Please edit the .rc file and try again. Exiting!\n"; exit; } # oops, at least one threshold undefined if ($unspec >= 1) { if ($unspec == 1) { print "Error - a threshold is not defined correctly:\n"; } else { print "Error - some thresholds are not specified. You have supplied the following:\n"; } print "motif\tthreshold\n"; foreach my $type (@motif_range) { my $defstring; if (@thresholds[$type-1] > 0) { $defstring = @thresholds[$type-1]; } else { $defstring = "MISSING"; } print "$type:\t$defstring\n"; } print "Please edit the .rc file and try again. Exiting!\n"; exit; } ############################################################### # open a file containing a list of the genomes to be searched # ############################################################### my @infiles = (); my @nreg = (); my $list_file; if ($opts{l}) { if (@ARGV) { chomp($list_file = shift); } unless (defined $list_file) { print "You need to specify the name of a list file to process.\n"; my @possibles = glob "*\.list"; if (@possibles) { print "Please select one of the following files:\n"; foreach (@possibles) { print " - $_\n"; } print "Type one of the above filenames (use the up arrow to select),\n or type in another.\n"; # get the user input my $term = new Term::ReadLine 'choose file'; my $prompt = "Select an file: "; foreach (@possibles) { $term->addhistory($_); } $term->ornaments('0'); $list_file = $term->readline($prompt); } else { print "I couln't find any list files in $cwd. Please choose a filename, or select X to exit.\n"; # get the user input my $term = new Term::ReadLine 'choose file'; my $prompt = "Select an file: "; $term->ornaments('0'); $list_file = $term->readline($prompt); exit if ($list_file =~ /^[Xx]/); } } # actually open the file at last... open (LISTFILE, "$list_file") or die "Can't open the file \"$list_file\": $!\nPlease check you have a valid list file, and try again."; while (my $infile = ) { chomp $infile; push (@infiles, $infile); } close LISTFILE; } else { @infiles = @ARGV; } # exit if no infiles unless (@infiles) { print "You don't seem to have specified any files to process...\n"; print $usage; exit; } # apply the override if ($override == 1) { $artemis = 0; $mine = 0; $fastafile = 0; $sumswitch = 0; $screendump = 0; $run_eprimer = 0; } # check the deps my @deps = qw(primer3_core eprimer3); my %find = (); $find{primer3_core} = $primer3core; $find{eprimer3} = $eprimer; if ($run_eprimer == 1) { &depcheck(\%find,$screendump,@deps); } # die if you have more things to look for than thresholds if (scalar @motif_range > scalar @thresholds) { die "You seem to have more types of msat to look for than you have thresholds! Please edit the config file and try again.\nExiting!\n"; } # precalculate some stuff for findmotifs_regex my $minrep = &min(@threshes); my $maxmotiflen = scalar(@thresholds); #Consistency check - see findmotifs_regex if( $minrep < 3 and $engine == 1) { print "Can't cope with finding less than three repeats of patterns! (\$minrep is $minrep) whilst using the regex engine.\n"; exit; }; if( $thresholds[0] =~ /\d+/ and $thresholds[0] < scalar(@thresholds) and $engine == 1) { print "Threshold for monos too small - cannot resolve ambiguities.\n"; exit; }; # this makes sure that your directories exist # before trying to write anything to them my $dir; foreach $dir (@newdirs) { $dir =~ s/\///; if (-e "$cwd/$dir") { unlink glob "$dir/*"; } unless (-e "$cwd/$dir") { mkdir "$cwd/$dir" or warn "Can't create $dir"; } } ################################################################# # an explication of main variables used throughout this script, # # except for all the ones I haven't bothered to list here. # ################################################################# my $aat; # %A divided by %AT my $abs_path; # path to $repeat_dir (same as $cwd . $repeat_dir) my $allfile; # one of the NC_xxx.xxx numbers in @all_files my $all_files2; # prepares a list of files w/ and w/o repeats my $alphabet; # DNA, RNA or protein? my $analysis_type; # how the .db files were prepared my $ATgenome; # AT% in the entire genome my $blaster; # the entire repeat plus flanking sequence my $cdsnum; # number of CDS regions found per genome my $cgc; # %C divided by %CG my $CGgen_flankseq; # GC content of flank my $CGgen_repeat; # GC content of repeat my $CGgenome; # % CG in the complete genome my $CGmatch; # see the previous CG variable my $checklength; # my $checkright; # Determine actual size of my $checkright_len; # flanks, and the position my $checkright_len_pc; # of the repeat in the my $checkleft; # genome (dist from L & R). my $checkleft_len; # my $checkleft_len_pc; # my $circular; # linear or circular DNA/RNA? my $count; # counts the number of genomes my $coding; # is this repeat coding or non-coding? my $def; # Genbank DEFINITION line my $dodgy; # file without valid content my $end; # used for determining repeat length my $entry; # CGI object used for MINE.db writing my $feature; # header for the .msat_tab file my $feature_plus_flank; # header for the .flank_tab file my $file; # the NC_xxx.xxx number currently being examined my $filein; # the Bio::SeqIO object my $flankseq; # flanking sequence only my $fname; # what the fasta file is called my $fout; # the famous lovely fasta header my $genome_len; # total length of the genome my $gentaxon; # a more general taxonomic category my $howmany; # number of list files in $cwd my $isaaa = 0; # is a amino acid... my $isformat; # what format is the file? my $join; # manipulation of Genbank CDS lines my $left; # sequence upstream of the repeat my $linkfile; # symlink to NC file, found in $cwd/_tab my $loc; # genbank locus my $match_count; # number of matches found my $match_len; # length of the match my $matrix_file; # file with num. reps. vs. size my $mineend; # "end" of filename on MINE.db and fasta repeat files my $minenum; # number of repeats found per genome my $motif_type; # mono, di, tri etc. my $motif_revcom; # motif reverse compliment my $numall; # total number of msats found my $numfiles; # number of files surveyed my $numreps; # number of repeats ($mineum minus the zeros) my $percentcode; # percentage coding for each genome my $prefix; # the taxonomic group (phages, viruses etc.) my $realfile; # file that $linkfile links to my $replength; # stores the longest repeat length my $repsum; # sum of repeat length in this genome my $repsumpc; # percentage of this genome consisting of repeats my $reverse; # set to 1 if the gene is a "compliment" my $right; # sequence downstream of the repeat my $sequence_string; # does what it says on the tin my $strand; # ss or ds? my $organism; # genbank ORGANISM my $tabfile; # name of the feature table associated with $file my $tempcoding; # sum of total number of coding bases my $totalreps = 0; # number of genomes containing repeats my $totalreppc; # percentage of genomes containing repeats my $primercount = 0; # count total number of genomes with primers # these variables are responsible for c/gc content &c. my $base_a; my $base_t; my $base_g; my $base_c; # arrays and hashes my @all_files=(); # stores all the $file seen my @all_files2=(); # stores a little more than @all_files (for output 8) my @genome_len=(); # list of genome lengths my @line=(); # lines of the genbank file my %mineseen=(); # stores the highest $minenum for each $file my %ncseen=(); # print each $file once to the brief summary my @dodgy=(); # files that contain invalid content my $total_ids; # how many unique seqs examined? my %repeat_counter=(); # store up the total type and number of repeats; my @short_names=(); # all short names (first key to repeat_counter) my @all_motifs=(); # all motif types ever found (second key to repeat_counter) my %motifs_by_genome=(); # all motif types found in one particular genome my %max_replength=(); # longest number of repeat units for any repeat in a genome my %has_msat=(); # remember the genomes with msats in my %unique_motifs=(); # unique motifs from all genomes my %repseen=(); # sums up the total length of repeats # in the whole genome my %genlenhash=(); # matches genome length to $file my %defseen=(); # matches $def to @all_files2 my %primercheck=(); # can primers be made for this particular sequence? my %pccodreps=(); # percentage of repeats in coding region my $genename; # remember the start of each repeat, to determine my $protname; # which gene it is in, later. my $prodname; # hashes used for determining which # entries are saved into a CSV text file # later in the script %ncseen=(); %mineseen=(); ################################### # .db files prepared by msatfinder # ################################### $analysis_type="msatfinder, run on $date, thresholds >$thresholds[0..5]"; # determine $prefix (taxonomic_group) from # the name of the working directory ($prefix = $cwd) =~ s#.*/##; $prefix =~ /(\w+)\.*/; $prefix = lc $1; $cwd .= "/"; # set up the taxon information for use in # the final database file $gentaxon = $config->{"FINDER.$prefix"}; unless (defined $gentaxon) { $gentaxon = $prefix; } # more will be added here to allow more generic # categorisation of the organelles, bacteria &c. # N.B. "specific" and "generic" taxa are notes # for the convenience of the user only (e.g. they # remind me what directories I put things in. ######################################### # Where should the output files go? # ######################################### # The summarised repeats now go in a # subdirectory of the working directory. # I have called this "./Repeats" $abs_path = $cwd . $repeat_dir; # path to $repeat_dir ########################################## # Open the core output files for writing # ########################################## # a list of files that seem corrupted (some genbank links are broken resulting in 'empty' files # during automated download from NCBI open (OUTPUT_FILE1, ">$abs_path$prefix.errors") or die "can't open $abs_path$prefix.errors file: $!"; # an index file giving the search parameters and details open (OUTPUT_FILE2, ">$abs_path$prefix.index") or die "can't open $abs_path$prefix.index file: $!"; # summary data of coding regions, etc. for each rep file open (OUTPUT_FILE3, ">$abs_path$prefix.repeats") or die "can't open $abs_path$prefix.summary file: $!"; # a list of all files, whether or not repeats are present open (OUTPUT_FILE5, ">$abs_path$prefix.genomes") or die "can't open $abs_path$prefix.total_sum: $!"; ######################################### # initialize some variables # ######################################### $| = 1; # flush the buffer... $count = 0; # number of genomes... ######################### ######################### # Start searching files # ######################### ######################### # Read through the list of file names to get all files to search foreach my $file (@infiles) { # take in the list of files push (@all_files, $file); $numfiles = @all_files; # announce the cuurent file being searched print "-----\n" if ($screendump == 1); print "Searching file: $file\n" if ($screendump == 1); # MINE file rep number setting $minenum = "0"; ++$count; $sequence_string = undef; ############################## # check if it's a fasta file # # if not, get LOCUS # ############################## $alphabet = undef; $strand = undef; $organism = "undefined"; $def = "undefined"; my $oldstop = 0; my ($oldline,$olderline) = ("ftang","ftang"); $isformat = 5; open (IN, "<$file") or die "can't open the genome file $file: $!"; my $sprotorembl = 0; while (my $line =) { chomp($line); # fasta format if ($line =~ "^>") { print "It looks like this is a FASTA file: $file\n" if ($screendump == 1); $isformat = 1; last; } # Swissprot format elsif ($line =~ /^ID/) { # could be EMBL or Swissprot $sprotorembl = 1; last; } # genbank format - the original, needs a lot of # processing to extract info. that bioperl can't get elsif ( $line =~ /^LOCUS/) { print "It looks like this is a Genbank file: $file\n" if ($screendump == 1); $isformat = 0; # get alphabet, and strandedness # I've assumed that DNA is ds unless # specifically stated otherwise. # NCBI claim that this is the case. my @locs = split("\s", $line); foreach my $loc (@locs) { if ($loc =~ /NA$/) { if ($loc =~ /(\D\D)-(\D+)/) { $strand = lc $1; $alphabet = lc $2; } else { $strand = "ds"; $alphabet = lc $loc; } } } } elsif ($isformat == 5) # assume must be ASCII format { print "I don't recognise the format of this this file: $file\nI'll assume that it's ASCII." if ($screendump == 1); $isformat = 4; last; } # definition elsif ($line =~ /^\s*DEFINITION\s*/) { $def = $line; $def =~ s/DEFINITION\s+//; } # species name elsif ($line =~ /^\s*ORGANISM\s*/) { $organism = "$line"; $organism =~ s/\s*ORGANISM\s*\b//; } # exit the loop after reading the important stuff elsif ($line =~ "FEATURES") { last; } # match the old line, in case a definition # extends over two lines (e.g. ORGANISM) if ($oldline =~ /^DEFINITION/ and $line =~ /^\s+/) { $line =~ s/\s+//; $def = "$def " . $line; print "Case 1: gbline is $line, oldline is $oldline, def is $def\n" if ($debug == 1); } if ($oldline =~ /^\s*ORGANISM/ and $line =~ /^\s+/) { $line =~ s/\s+//; $organism = "$organism " . $line; print "Case 2: gbline is $line, oldline is $oldline, organism is $organism\n" if ($debug == 1); } if ($olderline =~ /^\s*ORGANISM/ and $line =~ /^\s+/) { $line =~ s/\s+//; $organism = "$organism " . $line; print "Case 3: gbline is $line, olderline is $olderline, organism is $organism\n" if ($debug == 1); } # saves the previous line $olderline = $oldline; $oldline = $line; } close IN; # swissprot or EMBL format? if ($sprotorembl == 1) { # see if file is swissprot # if not, it must be EMBL my $testfile = Bio::SeqIO->new(-file => "$file", -format => 'EMBL'); my $testseq; while (1) { my $testseq; eval { $testfile->verbose(2); undef $@; $testseq = $filein->next_seq() }; #Done eval if($@) { print "It looks like this is an EMBL file.\n" if ($screendump == 1); $isformat = 3; last; } else { print "It looks like this is a Swissprot file.\n" if ($screendump == 1); $isformat = 2; last; } } $testfile->close(); } # new object, depeding on format if ($isformat == 1) # FASTA { $filein = Bio::SeqIO->new(-file => "$file", -format => 'Fasta'); } elsif ($isformat == 2) # Swissprot { $filein = Bio::SeqIO->new(-file => "$file", -format => 'swiss'); } elsif ($isformat == 3) # EMBL { $filein = Bio::SeqIO->new(-file => "$file", -format => 'EMBL'); } elsif ($isformat == 0) # genbank { $filein = Bio::SeqIO->new(-file => "$file", -format => 'genbank'); } elsif ($isformat == 4) # ASCII { $filein = Bio::SeqIO->new(-file => "$file", -format => 'raw'); } else { die "Unknown file type - please report this error.\n"; } ########################################### # loop through every sequence in the file # ########################################### @nreg=(); my $headerprint = 0; my $noofseqs = 1; print "Reading sequences from file $file...\n" if ($screendump == 1); SEQ: while (my $seq = $filein->next_seq()) # this point is very slow... { last unless $seq; # how much of this sequence is coding $tempcoding = 0; $cdsnum = 0; # first, check if amino acid not dna if ($seq->alphabet() =~ /protein/) { $isaaa = 1; print "This looks like an amino acid sequence.\n" if ($screendump == 1); } else { print "This looks like a nucleic acid sequence.\n" if ($screendump == 1); } # print headers &fileheaders($isformat,$isaaa) if ($headerprint == 0); $headerprint = 1; # get the ID of this sequence # and the sequence string $sequence_string = lc($seq->seq()); $genome_len = length $sequence_string; my $id = $seq->display_id(); $id =~ s/;//; $id =~ s/\.gbk//; # create a short name to refer to # this particular sequence my ($short_name,@ids); if ($isformat == 4) { $short_name = $datestring . "_" . $noofseqs; } else { @ids = split("\|", $id); if ($id =~ /^gi/) { $short_name = $ids[4]; } elsif ($id =~ /^gb/) { $short_name = $ids[2]; } else { $short_name = $id; } } $noofseqs++; # check for a valid sequence if (! $seq->validate_seq($sequence_string)) { print "Sequence $short_name is invalid!\n"; print OUTPUT_FILE1 "INVALID_SEQ|$short_name\n"; next SEQ; } # count msats in CDS regions for this genome $pccodreps{$short_name} = 0; push(@short_names,$short_name); $reverse = 0; # stuff needed to get the species details my $sp; my $ps; my ($species, $genus, @classification, $common_name, $sub_species, $binomial, $division, $gdate, @gdates, $specific_host, $lab_host, $db_xref, $note, $feature_count) = ""; my $ann; my (@annotations1,@annotations2); ################################################## # if a genbank file, extract further information # ################################################## if ($isformat == 0 or $isformat == 2 or $isformat == 3) { # species object $sp = $seq->species(); # information on the organism $alphabet = $seq->alphabet() unless (defined $alphabet); eval { $genus = $sp->genus(); } or $genus = "NA"; eval { $species = $sp->species(); } or $species = "NA"; eval { @classification = $sp->classification(); } or @classification = qw(na); eval { $common_name = $sp->common_name(); } or $common_name = "NA"; eval { $sub_species = $sp->sub_species(); } or $sub_species = "NA"; eval { $binomial = $sp->binomial(); } or $binomial = "NA"; eval { $division = $seq->division(); } or $division = "NA"; @gdates = $seq->get_dates(); my $gdate1 = $gdates[0]; my ($d1,$d2,$d3) = split("-",$gdate1); $gdate = $d3 . "-" . $dtrans{$d2} . "-" . $d1; $circular = 'linear'; $circular = 'circular' if $seq->is_circular(); $feature_count = $seq->feature_count(); # features my $whirley = Whirley->new(4); print "Reading all annotations in sequence...\n" if ($screendump == 1); foreach my $feat ($seq->all_SeqFeatures()) # this part is very slow... { print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1); my $tag = $feat->primary_tag(); # source: get hosts, notes &c. for this genome if ($tag =~ "source") { my @tags = $feat->all_tags(); foreach my $val (@tags) { my @tagvalues = $feat->each_tag_value($val); if ($val =~ /specific_host/) { $specific_host = $tagvalues[0]; } if ($val =~ /lab_host/) { $lab_host = $tagvalues[0]; } if ($val =~ /note/) { $note = $tagvalues[0]; } if ($val =~ /db_xref/) { $db_xref = $tagvalues[0]; $db_xref =~ s/taxon://; } } } # get start, stop and other stuff here. if ($tag =~ "CDS") { $cdsnum++; my $location = new Bio::Location::Split; $location = $feat->location(); if ($location->isa('Bio::Location::Split')) { my @sublocs = $location->sub_Location(); foreach my $loc (@sublocs) { my $featstart = $loc->start(); my $featend = $loc->end(); &addregion($featstart,$featend); } } else { my $featstart = $feat->start(); my $featend = $feat->end(); &addregion($featstart,$featend); } } } } ######################################## # calculate other things based on data # # from these info.-rich formats # # but not swissprot - is amino acid # ######################################## unless ($isaaa == 1) { # how much coding DNA is present? $tempcoding = &howlong || "0"; if ($tempcoding > $genome_len) { print "Something's wrong here!\n"; print "Tempcoding: $tempcoding\n"; print "Length: $genome_len\n"; print OUTPUT_FILE1 "CODING_MISMATCH|$short_name|$tempcoding|$genome_len\n"; } # determine the base count $base_a = $sequence_string =~ tr/aA/aA/; $base_c = $sequence_string =~ tr/cC/cC/; $base_g = $sequence_string =~ tr/gG/gG/; $base_t = $sequence_string =~ tr/tT/tT/; # check it! my $totlen = $genome_len - ($base_a + $base_t + $base_c + $base_g); if( $totlen < 0 ) { warn "Base count larger than seq length in $short_name!\n"; } } # increment the unique id count $total_ids++; # more debugging code if ($debug == 1) { print "IDS: @ids\n"; print "ID: $short_name\n"; print "FC: $feature_count\n"; } # annouce the sequence being searched print "Inspecting sequence: $short_name...\n" if ($screendump == 1); # after conversion genome is in $sequence_string; get length and G+C $genlenhash{$short_name} = $genome_len; push (@genome_len, $genome_len); unless ($isaaa == 1) { $CGgenome = $sequence_string =~ tr/cgCG/cgCG/; $ATgenome = $sequence_string =~ tr/atAT/atAT/; # determine if the genome is unusually # a or c rich if ($ATgenome == 0) { print "ATgenome is $ATgenome in $short_name\n"; print "File $short_name has invalid content!\n"; push (@dodgy, $short_name); print OUTPUT_FILE1 "LOW_AT_CONTENT|$short_name\n"; next; } $aat = $base_a/$ATgenome; $aat = sprintf("%.3f", $aat); if ($CGgenome == 0 ) { print "CGgenome is $CGgenome in $short_name\n"; print "File $short_name has invalid content!\n"; push (@dodgy, $short_name); next; } $cgc = $base_c/$CGgenome; $cgc = sprintf("%.3f", $cgc); } # stop division by zero if ($genome_len >0) { unless ($isaaa == 1) { $CGgenome = ($CGgenome/$genome_len)*100; $CGgenome = sprintf("%.3f", $CGgenome); } # send an error message if the sequence is very short - probably not a genome? if (length $sequence_string <=50) {print OUTPUT_FILE1 "DODGY_SEQ|$short_name|$genome_len|$sequence_string\n";} # make sure that the $percentcode is correct # moved up from lower in the script unless ($isaaa == 1) { $percentcode = 100*($tempcoding/$genome_len); $percentcode = sprintf("%.3f", $percentcode); } # hashes used when writing summary files # at the end of the script $mineseen{$short_name}=$minenum; $repseen{$short_name}=0; $defseen{$short_name} = $def; # Here we create yet another array to store all files names # "total_sum" if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0) { my $all_files2 = join "|", $short_name, $prefix, $gentaxon, $division, $strand, $alphabet, $circular, $gdate, $binomial, $genus, $species, $sub_species, $specific_host, $lab_host, $db_xref, $note, $common_name, $organism, $def, $genome_len, $tempcoding, $percentcode, $CGgenome, $cdsnum, $base_a, $base_t, $base_g, $base_c, $aat, $cgc; push (@all_files2, $all_files2); } if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1) { my $all_files2 = join "|", $short_name, $prefix, $gentaxon, $division, $alphabet, $circular, $gdate, $binomial, $genus, $species, $sub_species, $specific_host, $lab_host, $db_xref, $note, $common_name, $genome_len; push (@all_files2, $all_files2); } if ($isformat == 1 and $isaaa == 0) { my $all_files2 = join "|", $short_name, $genome_len, $CGgenome, $base_a, $base_t, $base_g, $base_c, $aat, $cgc; push (@all_files2, $all_files2); } if ($isformat == 1 and $isaaa == 1) { my $all_files2 = join "|", $short_name, $genome_len; push (@all_files2, $all_files2); } ################################################################## # This is the important part where the repeats are found using a # # the regexp below. Loop through this section by repeat length # ################################################################## $repsum = 0; my $res; # array ref with all msats within it my $whirley = Whirley->new($engine); if ($engine == 1) { $res = &findmotifs_regex($sequence_string,$whirley,$screendump); } elsif ($engine == 2) { $res = &findmotifs_multipass($sequence_string,$whirley,$screendump); } elsif ($engine == 3) { $res = &findmotifs_iterate($sequence_string,$whirley,$screendump); } # now process the msats in the res array # MATCH (a place marker for gvim) $replength = 0; my @motifs_in_this_genome = (); my %unique_motifs_local=(); next unless (@$res); foreach my $msat (@$res) { $has_msat{$short_name} = 1; my $match = $msat->{motif}; my $motif_type = length($match); my $match_len = $msat->{footprint}; my $match_count = $match_len/$motif_type; my $whole_match = $match x $match_count; my $start = $msat->{mpos}; my $end = $start + $match_len - 1; $repseen{$short_name} += $match_len; # ignore poly-N and poly-X of any type next if ($match =~ /n/ or $match =~ /x/); # count total number found $numall++; # determine the motif type, and # count the number of msats of # each type and length $repeat_counter{$short_name}{$match}[$match_count]++; push(@all_motifs,$match) unless ($unique_motifs{$match}); push (@motifs_in_this_genome,$match) unless ($unique_motifs_local{$match}); $unique_motifs{$match} = 1; $unique_motifs_local{$match} = 1; # reverse complement $motif_revcom = $match; $motif_revcom =~ tr/ACTGactg/TGACtgac/; # set the various coding region descriptions # to N/A (assuming non-coding unless we find # otherwise $genename = "N/A"; $protname = "N/A"; $prodname = "N/A"; $coding = 0; # determine whether or not the start or end of this repeat # falls within a CDS region my $check1 = &isitin($start); my $check2 = &isitin($end); if ($check1 == 1 or $check2 == 1) { $coding = 1; $pccodreps{$short_name}++; # determine which gene it is in # and all that foreach my $feat ($seq->all_SeqFeatures()) { # this information is not connected to # the CDS wherein the repeat is found... my $tag = $feat->primary_tag(); if ($tag =~ "source") { my @tags = $feat->all_tags(); foreach my $val (@tags) { my @tagvalues = $feat->each_tag_value($val); if ($val =~ /specific_host/) { $specific_host = $tagvalues[0]; } if ($val =~ /lab_host/) { $lab_host = $tagvalues[0]; } if ($val =~ /note/) { $note = $tagvalues[0]; } if ($val =~ /db_xref/) { $db_xref = $tagvalues[0]; $db_xref =~ s/taxon://; } } } # ...but this information is. It seemed like a good # idea to put them in the same place. my $featstart = $feat->start(); my $featend = $feat->end(); $reverse = $feat->strand(); if (($start >= $featstart and $start <= $featend and $tag eq "CDS") or ($start >= $featstart and $end <= $featend and $tag eq "CDS")) { if ($debug == 1) { print "--\n"; print "NAME: $short_name\n"; print "TAG: $tag\n"; print "Start: $featstart\n"; print "Repstart: $start\n"; print "End: $featend\n"; print "--\n"; } my @tags = $feat->all_tags(); foreach my $val (@tags) { my @tagvalues = $feat->each_tag_value($val); if ($val =~ /gene/) { $genename = $tagvalues[0]; } if ($val =~ /product/) { $prodname = $tagvalues[0]; } if ($val =~ /protein_id/) { $protname = $tagvalues[0]; } } } } } # print out stuff print "\nFound ($match)$match_count msat at $start:\n" if ($screendump == 1); # create Swissprot formatted output for Artemis if ($artemis > 0) { $feature = < $genome_len) { $fend = $genome_len; } else { $fend = $end + $flank_size; } $feature_plus_flank = < $replength); # print the feature to the artemis file if $artemis > 0 if ($artemis > 0 && $whole_match) { $tabfile = "$short_name.msat_tab"; my $basename = [split(/\//, $file)]->[-1]; print "Opening output file $tabfile\n" if ($screendump == 1); open (ART, ">>$tab_dir/$tabfile") or die "Can't open $tab_dir/$tabfile file for writing: $!"; print ART $feature; # link files in the artemis directory unless (-e "$tab_dir$file") { print "Linking $file to $tab_dir\n" if ($debug == 1); system("cd $tab_dir; ln -sf ../$basename .") == 0 or warn "Can't link $file to $tab_dir: $!"; close ART; } # print the "flank tabs" out if ($artemis == 2) { $tabfile = "$short_name.flank_tab"; print "Opening output file $tabfile\n" if ($screendump == 1); open (ART2, ">>$bigtab_dir/$tabfile") or die "Can't open your $bigtab_dir/$tabfile for writing: $!"; print ART2 $feature_plus_flank; unless (-e "$bigtab_dir$file") { print "Linking $file to $bigtab_dir\n" if ($debug == 1); system("cd $bigtab_dir; ln -sf ../$basename .") == 0 or warn "Can't link $file to $bigtab_dir: $!"; } close ART2; } } # used for determining the sequence upstream and # downstream of the match $left = substr ($sequence_string, $start-$flank_size, $flank_size-1); $right = substr ($sequence_string, $end, $flank_size); $blaster= $left . $whole_match . $right; $flankseq = $left . $right; my $repeat_plus_flank = length $blaster; my $flankseq_length = length($flankseq); my $motif_length = length($match); #calcuate %GC of flanks $CGgen_flankseq = $flankseq =~ tr/cgCG/cgCG/; $CGgen_flankseq = ($CGgen_flankseq/$flankseq_length)*100 unless ($flankseq_length == 0); $CGgen_flankseq = sprintf("%.3f", $CGgen_flankseq); #calcuate %GC of repeat $CGgen_repeat = $whole_match =~ tr/cgCG/cgCG/; print "CG\% MSAT: $CGgen_repeat / (length $whole_match))\n" if ($debug == 1); $CGgen_repeat = ($CGgen_repeat/(length $whole_match))*100; $CGgen_repeat = sprintf("%.3f", $CGgen_repeat); # this part writes the MINE.db files in # $PWD/MINE, one file per repeat #if ($mine == 1 && $whole_match). # If debugging is turned on, produce output to stderr # if any flanking regions extend past the end of the sequence if ($whole_match) { $minenum++; $mineend = $start . ".$match." . "$match_count"; $checklength = ((2*$flank_size) + $match_len-1); $checkright = substr ($sequence_string, $end); $checkright_len = length $checkright; $checkright_len_pc = sprintf("%.2f", 100*($checkright_len/$genome_len)); $checkleft = substr ($sequence_string, 1, $start-1); $checkleft_len = length $checkleft; $checkleft_len_pc = sprintf("%.2f", 100*($checkleft_len/$genome_len)); if ($repeat_plus_flank < $checklength && $debug == 1) { warn "Short flanks in $short_name.$mineend\:\t$checkleft_len\t$checkright_len\n"; } ######################################## # create new CGI for the MINE .db file # ######################################## if ($mine == 1) { print "Writing MINE file $short_name.$mineend.db\n" if ($screendump == 1); open (ENTRY, ">$mine_dir/$short_name.$mineend.db") or die "Won't open $short_name.$mineend.db"; my $entry = new CGI; $entry -> delete('keywords'); $entry -> append('filename', "$short_name.$mineend.db"); if ($remote_link) { $entry -> append('genome', "$short_name"); } else { $entry -> append('genome', $short_name); } $entry -> append('analysis_type', $analysis_type); $entry -> append('specific_taxon', $prefix); $entry -> append('generic_taxon', $gentaxon); $entry -> append('organism', $organism); $entry -> append('strand', $strand) unless ($isaaa == 1); $entry -> append('alphabet', $alphabet); $entry -> append('binomial', $binomial); $entry -> append('genus', $genus); $entry -> append('species', $species); $entry -> append('strain/subspecies', $sub_species); $entry -> append('common_name', $common_name); $entry -> append('definition', $def); $entry -> append('genome_length',$genome_len); $entry -> append('flank_length', $flank_size); $entry -> append('repeat_plus_flank', $repeat_plus_flank); $entry -> append('start', $start); $entry -> append('stop', $end); $entry -> append('repeat', "($match)$match_count"); $entry -> append('motif', $match); $entry -> append('motifrevcom', $motif_revcom) unless ($isaaa == 1); $entry -> append('footprint', $match_len); $entry -> append('type', $motif_type); $entry -> append('dist_from_left', $checkleft_len); $entry -> append('pc_from_left', $checkleft_len_pc); $entry -> append('dist_from_right', $checkright_len); $entry -> append('pc_from_right', $checkright_len_pc); $entry -> append('total_nt_coding', $tempcoding) unless ($isaaa == 1); $entry -> append('percent_coding', $percentcode) unless ($isaaa == 1); $entry -> append('coding_msat', $coding) unless ($isaaa == 1); $entry -> append('gene_name', $genename) unless ($isaaa == 1); $entry -> append('protein_name', $protname) unless ($isaaa == 1); $entry -> append('product_name', $prodname) unless ($isaaa == 1); $entry -> append('No_of_coding_regions', $cdsnum) unless ($isaaa == 1); $entry -> append('A/AT ratio', $aat) unless ($isaaa == 1); $entry -> append('C/CG ratio', $cgc) unless ($isaaa == 1); $entry -> append('GC content (genome)', $CGgenome) unless ($isaaa == 1); $entry -> append('GC content (flanks)', $CGgen_flankseq) unless ($isaaa == 1); $entry -> append('GC content (repeat)', $CGgen_repeat) unless ($isaaa == 1); $entry -> append('Artemis_repeats', "$short_name.msat_tab"); $entry -> append('Artemis_flank_repeats', "$short_name.flank_tab"); $entry -> append('seq', $blaster); $entry -> save("ENTRY"); close(ENTRY); } } ############## # fasta file # ############## # write stuff my $fout; if ($isaaa == 1) { my $cname = $common_name || "N/A"; $fout = ">$short_name.$mineend, Name: $cname, Repeat: ($match)$match_count, start: $start, end: $end, thresholds: @thresholds, compliment=$reverse"; $fname = "$fasta_dir$short_name.$mineend"; } else { $fout = ">$short_name.$mineend, Name: $common_name, Repeat: ($match)$match_count, start: $start, end: $end, thresholds: @thresholds, A:$base_a C:$base_c G:$base_g T:$base_t, coding=$coding, genename=$genename, proteinname=$protname, productname=$prodname, compliment=$reverse"; } my $fname = "$fasta_dir$short_name.$mineend.fasta"; if ($fastafile == 1) { print "Opening fasta FILE $fname\n" if ($screendump == 1); open (FASTA, ">$fname") or die "Can't open your fasta file for writing: $!"; print FASTA "$fout\n"; print FASTA "$blaster\n"; close FASTA; } # send left and right flanks to eprimer if ($run_eprimer == 1) { if ($isaaa == 1) { print "Primer generation disabled for AA files!\n" if $screendump == 1; } else { &primers($blaster,$match_len,$short_name,length($left),$mineend); } } # now print the .db data to a text CSV file # for general reference - "summary" if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0) { print OUTPUT_FILE3 "$short_name.$mineend|$short_name|$prefix|$gentaxon|$division|$strand|$alphabet|$circular|$gdate|$binomial|$genus|$species|$sub_species|$specific_host|$lab_host|$db_xref|$note|$common_name|$organism|$def|$genome_len|$flank_size|$repeat_plus_flank|$start|$end|($match)$match_count|$match|$motif_revcom|$match_len|$match_count|$checkleft_len|$checkleft_len_pc|$checkright_len|$checkright_len_pc|$motif_type|$tempcoding|$percentcode|$coding|$genename|$protname|$prodname|$reverse|$primercheck{$short_name}|$CGgenome|$CGgen_flankseq|$CGgen_repeat|$cdsnum\n" if ($sumswitch == 1); } if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1) { print OUTPUT_FILE3 "$short_name.$mineend|$short_name|$prefix|$gentaxon|$division|$alphabet|$circular|$gdate|$binomial|$genus|$species|$sub_species|$specific_host|$lab_host|$db_xref|$note|$common_name|$genome_len|$flank_size|$repeat_plus_flank|$start|$end|($match)$match_count|$match|$match_len|$match_count|$checkleft_len|$checkleft_len_pc|$checkright_len|$checkright_len_pc|$motif_type\n" if ($sumswitch == 1); } if ($isformat == 1 and $isaaa == 0) { print OUTPUT_FILE3 "$short_name.$mineend|$short_name|$genome_len|$flank_size|$repeat_plus_flank|$start|$end|($match)$match_count|$match|$motif_revcom|$match_len|$match_count|$checkleft_len|$checkleft_len_pc|$checkright_len|$checkright_len_pc|$motif_type|$primercheck{$short_name}|$CGgenome|$CGgen_flankseq|$CGgen_repeat\n" if ($sumswitch == 1); } if ($isformat == 1 and $isaaa == 1) { print OUTPUT_FILE3 "$short_name.$mineend|$short_name|$genome_len|$flank_size|$repeat_plus_flank|$start|$end|($match)$match_count|$match|$match_len|$match_count|$checkleft_len|$checkleft_len_pc|$checkright_len|$checkright_len_pc|$motif_type\n" if ($sumswitch == 1); } # increment the count of the number of # unique genomes with repeats if ($ncseen{$short_name} == undef) { $totalreps++; } $ncseen{$short_name}=1; # this hash determines the number of repeats per genome # each key will be reset each time through this loop # resulting in the highest number being retained. $mineseen{$short_name}=$minenum; # G+C content of the repeat $CGmatch = $whole_match =~ tr/cgCG/cgCG/; # stop division by zero if ($match_len >0) {$CGmatch = ($CGmatch/$match_len)*100;} # re-set reverse for each new repeat found $reverse = 0; } # end of MATCH # sort the motifs into order based on lenth then alphabet print "SN(0): @motifs_in_this_genome\n" if ($debug == 2); print "RL: $replength\n" if ($debug == 2); $motifs_by_genome{$short_name} = \@motifs_in_this_genome; $max_replength{$short_name} = $replength; } # end while $in } # end of ->next_seq() } # end of all the files in LISTFILE # SEARCHING COMPLETE # print out a list of all files, whether or # not they have repeats print "-----\n"; foreach my $line (@all_files2) { # extract number of repeats $line =~ /^(.+?)\|/; $file = $1; $numreps = $mineseen{$file}; $repsum = $repseen{$file}; $def = $defseen{$file}; my $count_coding_reps = $pccodreps{$file}; my $pccr; if ($numreps > 0 and $isformat != 2) { $pccr = 100 * ($count_coding_reps/$numreps); $pccr = sprintf("%.3f", $pccr); } else { $pccr = "N/A"; } # set a switch (0/1) if there are msats my $repyn; if ($numreps == 0) { $repyn = 0; } elsif ($numreps > 0) { $repyn = 1; } # determine genome length my $genome_length = $genlenhash{$file}; if ($genome_length > 0 && $repsum > 0) { $repsumpc = 100*($repsum/$genome_length); $repsumpc = sprintf("%.3f", $repsumpc); } else { $repsumpc = 0; } unless (defined $numreps) { $numreps = "0"; } print "numreps: $numreps, file: $file\n" if ($debug == 1); if ($isformat == 0 or $isformat == 2 or $isformat == 3) { print OUTPUT_FILE5 "$line|$repsum|$repsumpc|$repyn|$numreps\n"; } if ($isformat == 1) { print OUTPUT_FILE5 "$line|$repsum|$repsumpc|$repyn|$numreps\n"; } if ($debug == 1) { print "FILE: $file\n"; print "RS: $repsum\n"; print "DEF: $def\n"; print "CCR: $count_coding_reps\n"; print "NR: $numreps\n"; print "PCCR: $pccr\n"; print "-----\n"; } } # do a bit of summary to STDOUT if ($screendump ==1 ) { print "\n\nA total of $count files surveyed\n"; foreach $allfile (@all_files) { print "$allfile\n"; } if (defined $dodgy[0]) { print "\n\nFiles with invalid GC content were found.\n"; foreach $dodgy (@dodgy) { print OUTPUT_FILE1 "INVALID_GC|$dodgy\n"; } print "\n\n"; } } ############################# # provide handy index files # ############################# # set up arrays/hashes for summation of all # values over all genomes - fill these with values # for single genomes, calculated in the next loop # and print them later my %overall_size_matrix=(); my $overall_max_replength = 0; my %overall_repeat_counter =(); my %rem_revcom = (); # determine the lowest threshold in use my @lowest; foreach my $mr (@motif_range) { push (@lowest, $thresholds[$mr]); } ##################################### # print count files for each genome # ##################################### foreach my $short_name (@short_names) { print "SN(1): $short_name\n" if ($debug == 2); next unless ($has_msat{$short_name} == 1); print "SN(2): $short_name\n" if ($debug == 2); # get max replength $overall_max_replength = $max_replength{$short_name} if ($max_replength{$short_name} > $overall_max_replength); # display by exact motif my $count_file = "$count_dir/$short_name.motif.count"; open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!"; print COUNTFILE "units,"; foreach my $unmot ($motifs_by_genome{$short_name}) { print "UN: @$unmot\n" if ($debug == 2); my @out = (); my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @$unmot; foreach my $un (@sorted) { push (@out,$un); } my $out = join ",", @out; $out =~ s/,$//; print COUNTFILE "$out\n"; print "OUT: $out\n" if ($debug == 2); } for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++) { print COUNTFILE "$k,"; foreach my $unmot ($motifs_by_genome{$short_name}) { my $count = 1; my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @$unmot; foreach my $un (@sorted) { print COUNTFILE $repeat_counter{$short_name}{$un}[$k] || 0; print COUNTFILE "," unless $count == scalar(@$unmot); $overall_repeat_counter{$un}[$k] += $repeat_counter{$short_name}{$un}[$k]; $count++; } } print COUNTFILE "\n"; } close COUNTFILE; # by motif length # first, print file header $count_file = "$count_dir/$short_name.type.count"; open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!"; print COUNTFILE "units,"; my $out = join ",", @motif_range; $out =~ s/,$//; print COUNTFILE "$out\n"; my %size_matrix=(); # count up the number of each class of msat for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++) { foreach my $unmot ($motifs_by_genome{$short_name}) { my @out = (); foreach my $un (@$unmot) { $size_matrix{length($un)}[$k] += $repeat_counter{$short_name}{$un}[$k]; $overall_size_matrix{length($un)}[$k] += $repeat_counter{$short_name}{$un}[$k]; } } } # print them to file for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++) { print COUNTFILE "$k,"; foreach my $i (@motif_range) { print COUNTFILE $size_matrix{$i}[$k] || 0; print COUNTFILE "," unless $i == @motif_range[-1]; } print COUNTFILE "\n"; } close COUNTFILE; } ######################################################### # print out summary files of numbers and types of msats # # found in all genomes # ######################################################### # by motif length my $count_file = "$abs_path$prefix.type.count"; open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!"; # print headers print COUNTFILE "units,"; my $out = join ",", @motif_range; $out =~ s/,$//; print COUNTFILE "$out\n"; # do the calculations for (my $k=$lowest[0];$k<=$overall_max_replength;$k++) { print COUNTFILE "$k,"; foreach my $i (@motif_range) { print COUNTFILE $overall_size_matrix{$i}[$k] || 0; print COUNTFILE "," unless $i == @motif_range[-1]; } print COUNTFILE "\n"; } close COUNTFILE; # genome totals, but by exact motif # use reverse compliment of motifs for this one (only), # to reduce size of the file my $count_file = "$abs_path$prefix.motif.count"; open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!"; print COUNTFILE "units,"; # print headers my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @all_motifs; my %mseen=(); # prepare another hash with revcom'd motifs therein my %revcoms = (); my @word_keys = (); for (my $k=$lowest[0];$k<=$overall_max_replength;$k++) { foreach my $motif (@sorted) { my $key = join '', sort split '', $motif; push (@word_keys, $key) unless ($mseen{$key} == 1);; $revcoms{$key}[$k] += $overall_repeat_counter{$motif}[$k]; $mseen{$key} = 1; } } my $rcout = join ",", @word_keys; $rcout =~ s/,$//; print COUNTFILE "$rcout\n"; # print values for (my $k=$lowest[0];$k<=$overall_max_replength;$k++) { print COUNTFILE "$k,"; my $count = 1; foreach my $un (@word_keys) { print COUNTFILE $revcoms{$un}[$k] || 0; print COUNTFILE "," unless $count == scalar(@word_keys); $count++; } print COUNTFILE "\n"; } close COUNTFILE; # get total length of genomes searched my $totlen = 0; foreach my $length(@genome_len) { $totlen += $length; } # do an index file with an overall summary. # count total msats by type my %totalcount = (); foreach my $i (@motif_range) { foreach my $num (@{$overall_size_matrix{$i}}) { $totalcount{$i} += $num; } } # count total msats by motif # compressing by revcom my %totalrevcom = (); my $numrevs = 0; foreach my $stuff (@sorted) { for (my $k=$lowest[0];$k<=$overall_max_replength;$k++) { my $key = join '', sort split '', $stuff; $totalrevcom{$key} += $overall_repeat_counter{$stuff}[$k]; } } my $totfound = $numall || 0; # the main summary file my %engine_list = (1 => 'regex', 2 => 'multipass', 3 => 'iterative'); print OUTPUT_FILE2 <$data_index") or die "Can't open $data_index: $!"; print DI "\n\n\n"; print DI <Below is a listing of the various directories and files created by msatfinder, viz.
  • $repeat_dir contains most of the summary information, including the overall summary, $prefix.index.
  • $count_dir summary of msat types and motifs for each sequence run.
  • $fasta_dir fasta files for each msat found.
  • $prime_dir primer files for each msat found.
  • $tab_dir, $bigtab_dir feature tables for viewing in artemis (with and without flanks, respectively).
  • $mine_dir MINE files - summaries of information on each msat, for use with the MINE code from our lab.
EOF foreach my $dir ($repeat_dir,$count_dir,$fasta_dir,$prime_dir,$tab_dir,$bigtab_dir,$mine_dir) { print DI "

$dir

\n"; my @files = glob "$dir*"; foreach my $file (@files) { my $fileroot = [split("/",$file)]->[-1]; print DI "$fileroot
\n"; } } print DI "\n"; close DI; ############# # finished! # ############# print "Msatfinder finished!\n"; ############# # FUNCTIONS # ############# # determine whether primers can be made from the # left and right flanking sequences #&primers($blaster,$match_len,$short_name,length($left),$mineend); sub primers { my $priseq = shift; my $mlen = shift; my $id = shift; my $left = shift; my $end = shift; # make sure that primers include the repeat region my $start_include = $left -1; my @endings = split(/\./,$end); my $end_include = $left + (length($endings[1]) * $endings[2]) + 1; # get primer data my $infile = "$prime_dir$id.$end.temp"; open (WRITE, ">$infile") or die "Can't open $infile: $!"; print WRITE $priseq; close WRITE; my $outfile = "$prime_dir$id.$end.primers.txt"; if ($screendump == 1) { print "Determining PCR primers.\n"; print "Outfile: $outfile\n"; } system("$find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args > /dev/null 2>&1") == 0 or warn "Could not run \"$find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args\"; $!"; if ($screendump == 1) { print "Running command: $find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args\n"; print "Unlinking file: $infile\n" } unlink $infile unless ($debug == 1); $primercount++; $primercheck{$id} = 1; # remove output files if no primers were found, unless the user wants to know why unless ($eprimer_args =~ /pickanyway/ and $eprimer_args =~ /explain/) { if (-e $outfile) { open (CHECK, "<$outfile") or die "Error checking eprimer output $outfile: $!"; my @lines = ; close CHECK; unless (grep /FORWARD/, @lines) { unlink $outfile; print "No valid primers found for $id\n" if ($screendump == 1); $primercount--; $primercheck{$id} = 0; } } } } ######################################### # find microsatellites by various means # ######################################### # there are three means available: # 1. Iterate through the sequence once, trying to build the longest msat possible. # 2. Go through once with a powerful regular expression, and post-filter. # 3. Go through scalar $motif_range times, finding all msats. sub findmotifs_iterate { no warnings; my $sequence = shift; my $whirley = shift; my $screendump = shift; my @result = (); my $iterate = 0; CREEP: while ($iterate < (length($sequence) -1)) # position within sequence { print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1); print "IT: $iterate\n" if ($debug == 2); CHECK: foreach my $i (@motif_range) # mono,di,tri &c. { print "I: $i, ", $motif_range[-1], "\n" if ($debug == 2); my $units = 1; # number of repeat units of msat my $first = substr($sequence,$iterate,$i); # attempt to extend the match... while (1) { my $offset = $iterate + ($units*$i); my $subsequent = substr($sequence,$offset,$i); print "Comp(1): $first, $subsequent, $i\n" if ($debug == 2); if ($first ne $subsequent) { print "Comp(2): $first, $subsequent, $i\n" if ($debug == 2); if ($units >= $thresholds[$i-1]) { # attempt to reject false matches here, e.g. a tri picked up # as hexa or nona, &c. my $footprint = $units * $i; if (&splitmotif($first) == 1) # reject this - it's too small { $iterate = $iterate + $footprint; next CREEP; } else # keep this one, as it is a good match after all! { print "Match($units): ", "$first." x $units, "\n" if ($debug == 2); push(@result,{motif=>$first,footprint=>$footprint,mpos=>($iterate+1)}); $iterate = $iterate + $footprint; next CREEP; } } elsif ($i < $motif_range[-1]) { next CHECK; } else { $iterate++; next CREEP; } } $units++; } } } return \@result; } # thanks to T. Booth for this one sub findmotifs_regex { # V3 with speed of teh true ninja my $sequence = shift; my $whirley = shift; my $screendump = shift; my @result = (); #Internal stuff #savepos must be declared with 'our' to see it within the regex. our $savepos; # Pre-generate main regex for even more speed: our ($mr, $regexes1); if(!$mr) { #Setup environment... $mr = $minrep - 2; $regexes1 = qr/((.{1,$maxmotiflen}?)(?>\2)\2{$mr,})/; } #Use a regex to spot candidate repeats #This will favour short motifs over long ones while ($sequence =~ /$regexes1/g) { my $motif = $2; my $motiflen = length($2); my $footprint = length($1); $savepos = pos($sequence); my $mpos = $savepos - $footprint; next unless ($thresholds[$motiflen - 1] =~ /\d+/); #We have a candidate but: consider aaaaataaaaataaaaataaaaataaaaat # where the footprint of aaaaa (5) is shorter than the longest motif (6). # # Design decision : if thresholds permit motif longer than shortest footprint # then bail out - see above - that sorts out the ambiguity and takes us to the next case. # # Also - consider aaaaataataataataatg # Do we want a(5) and taa(4) or aat(5) ? # Depends if a(5) is within the threshold. # # If threshold check fails re-scan from $pos with new $minmotiflen # and look for a better candidate starting within the original footprint. while( $footprint < $thresholds[$motiflen - 1] * $motiflen ) { print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1); $motif = undef; pos($sequence) = $mpos; my $minmotiflen = $motiflen + 1; last if $minmotiflen > $maxmotiflen; # This was really slow for big sequences because it scans the whole of the rest # of the sequence. In fact we only need to look up to savepos # $sequence =~ /$regexes[$minmotiflen]/g; # # # last if !pos($sequence); # No match at all # # $mpos = pos($sequence) - length($1); # last if $mpos >= $savepos; # Match later in sequence - we'll come back to it. #Alternative uses a hook into the regex engine: use re 'eval'; $sequence =~ /( (.{$minmotiflen,$maxmotiflen}?) (?(?{$-[2]<$savepos})(?:(?>\2)\2{$mr,})|(.) ) )/gx; #Give up if we hit the end or go past $savepos and therefore matched one char last if $3; last unless pos($sequence); $mpos = pos($sequence) - length($1); $motif = $2; $motiflen = length($2); $footprint = length($1); } # So now we have 2 cases: # 1) $motif is set - add to array # 2) $motif is undef : reset pos to $savepos and continue if($motif) { push(@result,{ motif=> $motif, footprint=> $footprint, mpos=> (1 + $mpos) }); } else { pos($sequence) = $savepos; } } # end main loop print "\n"; return \@result; } sub findmotifs_multipass { # useful in that it finds overlapping msats - each pass through # the sequence pays no heed to the result of previous passes, so # results may differ from the previous two methods my $sequence = shift; my $whirley = shift; my $screendump = shift; my @result = (); foreach my $i (@motif_range) { my $pattern = "." x $i; my $thresh = ($thresholds[$i-1] - 1); while ($sequence =~ /(($pattern)\2{$thresh,})/g) { print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1); my $whole_match = $1; my $match = $2 if (defined $2); if (&splitmotif($match) == 1) { next; } # reject this - it's too small else { push(@result,{motif=>$match,footprint=>length($whole_match),mpos=>(1 + pos($sequence)-length($whole_match))}); } } } return \@result; } # add arrays sub addarrays { my @res = (); my ($ar, $i); for $ar (@_) { for($i = 0; $i < @$ar; $i++) { $res[$i] += $ar->[$i] if $ar->[$i] } }; return @res; } # print output files sub fileheaders { my $isformat = shift; my $isaaa = shift; # Genbank &c. file if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0) { print OUTPUT_FILE3 "repeat|genome|specific_taxon|generic_taxon|division|strand|alphabet|circular|date|binomial|genus|species|strain/subspecies|specific_host|lab_host|taxid|notes|common_name|organism|definition|genome_length|flank_length|repeat_plus_flank|start|stop|repeat|motif|motifrevcom|footprint|repeat_units|dist_from_left|pc_from_left|dist_from_right|pc_from_right|motif_type|total_nt_coding|percent_coding|coding_repeat|gene|protein|product|reverse|primers|GC_content(genome)|GC_content(flank)|GC_content(repeat)|No_of_coding_regions\n"; print OUTPUT_FILE5 "genome|specific_taxon|generic_taxon|division|strand|alphabet|circular|date|binomial|genus|species|strain/subspecies|specific_host|lab_host|taxid|notes|common_name|organism|definition|genome_length|total_nt_coding|percent_coding|GC_content|no_of_coding_regions|a|t|g|c|a/at|c/gc|total_rep_length|pc_repeats|msats|No_of_msats\n"; } if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1) { print OUTPUT_FILE3 "repeat|genome|specific_taxon|generic_taxon|division|alphabet|circular|date|binomial|genus|species|strain/subspecies|specific_host|lab_host|taxid|notes|common_name|sequence_length|flank_length|repeat_plus_flank|start|stop|repeat|motif|footprint|repeat_units|dist_from_left|pc_from_left|dist_from_right|pc_from_right|motif_type\n"; print OUTPUT_FILE5 "genome|specific_taxon|generic_taxon|division|alphabet|circular|date|binomial|genus|species|strain/subspecies|specific_host|lab_host|taxid|notes|common_name|sequence_length|total_rep_length|pc_repeats|msats|No_of_msats\n"; } # fasta file if ($isformat == 1 and $isaaa == 0) { print OUTPUT_FILE3 "repeat|genome|genome_length|flank_length|repeat_plus_flank|start|stop|repeat|motif|motifrevcom|footprint|repeat_units|dist_from_left|pc_from_left|dist_from_right|pc_from_right|motif_type|primers|GC_content(genome)|GC_content(flank)|GC_content(repeat)\n"; print OUTPUT_FILE5 "genome|genome_length|GC_content|A|T|G|C|A/AT|C/GC|total_rep_length|pc_repeats|msats|No_of_msats\n"; } # fasta file (amino acid) if ($isformat == 1 and $isaaa == 1) { print OUTPUT_FILE3 "repeat|genome|genome_length|flank_length|repeat_plus_flank|start|stop|repeat|motif|footprint|repeat_units|dist_from_left|pc_from_left|dist_from_right|pc_from_right|motif_type\n"; print OUTPUT_FILE5 "genome|genome_length|total_rep_length|pc_repeats|msats|No_of_msats\n"; } } # determine if msat can be # split into smaller motifs sub splitmotif { my $motif = shift; my $length = length $motif; print "Motif: $motif\n" if ($debug == 1); for (my $i=1;$i<$length;$i++) { # only run this if haven't already looked for motifs # of length $i... unless (grep { /^$i$/ } @motif_range and ($engine == 1 or $engine == 2)) { # split motif into array of items of # length $i my @stuff = (); my $pos = 0; while ($pos < $length) { push(@stuff,substr($motif,$pos,$i)); $pos += $i; } # see if all items in array match... if (scalar @stuff > 1) # ...unless there's only one item, or none. { my @matching = grep { /^$stuff[0]$/ } @stuff; if (scalar @stuff == scalar @matching) { return 1; } } } } return 0; } ##################################### # perform various size calculations # ##################################### # add msat + flanking regions together # sub addregion { my ($start,$end) = @_; if ($start > $end) { ($start,$end) = ($end,$start); } my $done = 0; my $k; my $tempcoding = 0; for ($k=0;$k<@nreg;$k++) { # new region starts after old region ends next if ($nreg[$k][1] < $start); # new region ends before old region starts if ($end < $nreg[$k][0]) { splice(@nreg,$k,0,[$start,$end]); $done = 1; last; } # regions overlap $nreg[$k][0] = $start < $nreg[$k][0] ? $start : $nreg[$k][0]; $nreg[$k][1] = $end > $nreg[$k][1] ? $end : $nreg[$k][1]; $done = 1; # Combine two regions if the new section has brough about an overlap while ($k < $#nreg and $nreg[$k][1] > $nreg[$k+1][0]) { $nreg[$k][1] = $nreg[$k+1][1]; splice(@nreg,$k+1,1); } last; } unless ($done) { push @nreg,[$start,$end]; } } # determine if region is within the defined area sub isitin { my ($what) = @_; my $within = 0; my $k; for ($k=0; $k<@nreg; $k++) { if ($what >= $nreg[$k][0] and $what <= $nreg[$k][1]) { return 1; } } return 0; } # return length of coding regions sub howlong { my $length; for (my $k=0; $k<@nreg; $k++) { $length += ($nreg[$k][1] - $nreg[$k][0]); } return $length; } ############################### # depchecks, config files &c. # ############################### # read config file sub getconfig { my $cwd = shift; my $arg = shift; my $config_file="msatfinder.rc"; my %config; if (-R "$cwd/$config_file") { Config::Simple->import_from("$cwd/$config_file", \%config); print "Using config file in $cwd\n" unless ($0 =~ /viewer/ or $> == 48); # or 81... return \%config; } elsif ($arg =~ /-h/ or $arg =~ /--help/) { return \%config; } else { print "No configuration file was found. Please make sure you have a copy of msatminer.rc in $cwd, and try again.\n"; die; } } # check dependencies sub depcheck { my $find = shift; my $screendump = shift; my @missing; foreach my $dep (@_) { if (-x $find->{$dep}) { print "Dependency $dep found.\n" if ($screendump == 1); } else # at least try to find it even if the confit is wrong { my $search = `which $dep 2> /dev/null`; chomp($search); if ($search =~ /aliased to (\.+)/) { $find->{$dep} = $1; print "$dep found, but not where your config file says it should be. Continuing...\n" if ($screendump == 1); } elsif (-x $search) { $find->{$dep} = $search; print "$dep found, but not where your config file says it should be. Continuing...\n" if ($screendump == 1); } else { push(@missing, $dep); } } } if (@missing) { print "The following dependencies are not found: @missing\n"; print "Please add the full path to each dependency to the configuration file and try again.\n"; exit; } } ################################################## # a please wait thingy, thanks to: # # http://www.perlmonks.org/index.pl?node_id=4943 # ################################################## package Whirley; { sub new { my $type = shift; my $number = shift; my $class = ref $type || $type; my $WHIRLEY_COUNT = -1; my @whirleyhash = ( [qw/. .o .oO .oO(zzz) .oO(ZZZ) /], [qw(>--- ->-- -->- ---> ---* --- ---< --<- -<-- <--- *---)], [qw(- \ | / )], [qw(_o_ \o/ _O_ \O/)], [qw(/... ./.. ../. .../ ...# ... ...\ ..\. .\.. \... #...)], [ '(^_^ )','( ^_^)'], [ '(^_^ )','( ^_^)','( -_-)', '(-_- )' ], [map chr, qw/32 176 177 178 219 178 177 176/] ); my @whirley = @{$whirleyhash[$number]}; my $self = sub { $WHIRLEY_COUNT = 0 if ++$WHIRLEY_COUNT == @whirley; return $whirley[$WHIRLEY_COUNT]; }; bless $self, $class; } } ############### # help, help! # ############### sub HELP_MESSAGE { print $usage; exit; } sub VERSION { print "Msatfinder version: $version\n"; exit; } __END__