#!/usr/bin/perl -w ################### REannotate ################### # author: Vini Pereira # email : vini_moll@rocketmail.com # REannotate. Automated re-annotation and evolutionary analysis of repetitive DNA. # Copyright (C) 2003-2007 Vini Pereira. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ################### version 26.11.2007 ################ # # Re-annotates RepeatMasker annotation for complex repeats, # in order to infer insertion events, i.e. defragment multiple hits # that originated in the same insertion event. # If required it writes the sequence of identified, defragmented # elements to disk in two formats, the raw genomic sequence and # a gapped sequence of regions matching, and aligned to, a library sequence # (used in the RepeatMasker search). # # For LTR-elements the full element plus LTR and internal region sequences # are saved in separate files (fasta format). If required intra-element # LTR pairs can be automatedly aligned using clustalW, and the # number of nucleotide substitutions between them estimated, so that # the element age of insertion can also be estimated. # # The script also resolves nesting patterns of complex repeat elements. # # Obligatory command line argument: # RepeatMasker annotation (.out) file. # Second optional argument is a FASTA file # contaning second the query sequence(s) that were annotated by RepeatMasker # and reported in its annotatation file (the first argument). The latter # argument is necessary if sequences of elements identified by REannotate # are to be written to disk, and sequence analyses of LTR-elements performed. use strict; use Getopt::Long; use File::Basename; use English; use FileHandle; my %parameter = ( fragmentSearchRange => 40000, # Max distance in base pairs considered by the LTR pairing algorithm # maxIndel => 15000, # Max tolerated LTR fragment overlap/gap in base pairs boundaryTolerance => 40, # Discrepancy (in num of bases) tolerated when dealing sequence boundaries minDistanceToSoloLTR => 15000, # Minimum distance between an unpaired LTR and the nearest internal region # of same family and orientation necessary for such an unpaired LTR to be # classified as a 'solo' LTR doAlign => 0, # Align intra-element LTR pairs (set if alignment -k option is TRUE) pathToClustalW => 0, # Full path to clustalW (set if "alignment -c" option is TRUE)) arabidopsis => 0, # TRUE if query sequences come from Arabidopsis thaliana, FALSE otherwise # (optional) chromo => 0, # Chromosome number will be set if "arabidopsis -a" option is TRUE rateOfEvolution => 0, # Rate of nucleotide substitution in substitutions per site per million years # (set if "kalign -k" option is TRUE) outDir => "REannotate_output", # Basename of directory within the directory from which this script is run, # where all output of REannotate will be written LTRoutDir => "LTR_elements", # Basename of directory within the REannotate output directory # where output LTR element sequences will be stored otherOutDir => "non-LTR_elements", # Basename of directory within the REannotate output directory # where output non-LTR element sequences will be stored completeDir => "complete", # Basename of dir within LTRoutDir where sequences of complete elements # will be stored LTRdir => "LTRs", # Basename of directory within completeDir where intra-element LTR pair # sequences will be stored INTdir => "internal", # Basename of directory within completeDir where internal region sequences # will be stored fullDir => "full", # Basename of directory within completeDir where full sequences will be stored soloDir => "solo", # Basename of dir within LTRoutDir where solo LTR sequences will be stored truncatedDir => "truncated", # Basename of directory within LTRoutDir where truncated element sequences # will be stored alignDir => "alignments", # Basename of direcotry within LTRdir where intra-element LTR pair # alignments would be stored isFASTAinput => 1, # This flag stays TRUE if input sequence file is in fasta format noSeqOutput => 0, # This flag stays FALSE unless user supresses sequence output at command line maxRecursion => 6, # Max number of recursive calls for sub 'defragmentation' when assembling # multiple hits associated with each element repeatClassColumn => 1, # TRUE if the input RepeatMasker annotation file has a 'repeat class' column fuzzynames => [], # this array ref will store arrays of equivalent lib seq names if option -f atomicfuzzynames => [], # this array ref will store an atomic array with all the names in equivalent lib seq names if option -f truncatedNesting => 0, # resolve truncated nesting if TRUE (i.e. if set with the -t option) inputError => "", # Will store error messages related to improper input data UCSCinput => 0, # Will indicate whether input annotation is from UCSC Genome Browser table ungapped => 0 # If option "u|ungapped" is used this will be set to TRUE and ungapped sequences output ); my ($script) = ($0 =~ m|([^/]*)$|); my $version = "26.11.2007"; my $USAGE = "\nUSAGE: (perl) ./REannotate [-OPTIONS] [] options: -h (-help) : Displays command line options -v (-version) : Displays version of REannotate -n (-noseq) : This option suppresses all sequence output, so that REannotate will only generate annotation. If this option is used no sequence input is required. -u (-ungapped) : This option outputs the chromosomal sequence spanned by a defragmented element (from the first to the last fragment), in addition to the default gapped element sequence that is aligned to a reference element. -k (-kalign) : Aligns intra-element LTR pairs using the clustalW program and estimates K (the number of nucleotide substitutions per site between intra-element LTRs) using the Kimura 2-parameter model, for each 'complete' element found. Required argument is the full path to the clustalW programme. -r (-rate) : Estimates the time elapsed since the insertion of structurally 'complete' LTR-elements, if the -k option is being used. Required argument is the rate of evolution in number of substitutions per site per million years. -g (-gff) : Outputs extra annotation in GFF format (e.g. for visualisation in the Apollo Genome Browser). -f (-fuzzy) : Allows defragmentation of chromosomal repeat fragments that match different reference elements. The required argument is the (path to) the filename of a text file containing lists of 'related' (equivalent) reference sequence names. -t (-tnest) : Allows 'truncated nesting' to be annotated, i.e. elements that interrupt one terminus of another element (but which are not contained within the latter) will be annotated as nested. -d (-drange) : Sets the 'search range' for defragmentation, the maximum distance (in number of nucleotides) that is allowed between two repeat fragments to be considered as a candidate fragments of the same element. Required argument is the search range in nucleotides. Default = 40000. -b (-boundary) : Sets the maximum tolerated overlap (in number of nucleotides along a reference element sequence) between chromosomal matches to the same reference element to be considered as fragments of the same chromosomal element. Required argument is the alignment overlap in nucleotides along a reference sequence. Default = 40. -s (-solo) : Sets the minimum distance required (in number of nucleotides) between a given LTR and any other LTR or internal region (belonging to the same family and in the same orientation on the host chromosome) for the given LTR to be considered as a 'solo' LTR. Required argument is the distance in nucleotides. Default = 15000. -m (-stream) : Outputs re-annotation to STDOUT (all messages saved in file ) -l (-long) : Outputs ASCII representation of the order and nesting of defragmented elements along a query sequence. (NOT RECOMMENDED FOR LARGE QUERIES!) -c (-class) : Outputs re-annotation to four separate files (instead of the one file by default), one file for each class: non-LTR-elements, complete LTR-elements, truncated LTR-elements, and solo LTRs. "; #-a (-arabidopsis) # : RepeatMasked queries assumed to come from an A. thaliana # chromosome, extra information (distance from centromere, # superfamily) is generated. #options list my ($opt_m,$opt_h,$opt_v,$opt_k,$opt_r,$opt_a,$opt_d,$opt_i,$opt_s,$opt_b,$opt_n,$opt_f,$opt_t,$opt_g,$opt_l,$opt_c,$opt_u)=(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); # get the supplied command line options, and set flags &GetOptions('m|stream!'=>\$opt_m,'h|help!'=>\$opt_h,'v|version!'=>\$opt_v,'k|kalign=s'=>\$opt_k,'r|rate=f'=>\$opt_r,'a|arabidopsis=i'=>\$opt_a,'d|drange=i'=>\$opt_d,'i|indelmax=i'=>\$opt_i,'s|solo=i'=>\$opt_s,'b|boundary=i'=>\$opt_b,'n|noseq!'=>\$opt_n,'f|fuzzy=s'=>\$opt_f,'t|tnest!'=>\$opt_t,'g|gff!'=>\$opt_g,'l|long!'=>\$opt_l,'c|class!'=>\$opt_c,'u|ungapped!'=>\$opt_u); my $MESSAGES = *STDOUT; # file handle for messages my $outPath=$ENV{'PWD'}."/"; # set output path to current directory # check whether the output directory already exists: # if not create it, if it does rename existing structure before creating new one open (TIME, "date +%T-%F |"); my $timeStamp =