ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fasplit
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 2471 byte(s)
Log Message:
Line File contents
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Std;
use Scalar::Util qw(looks_like_number);
use FindBin;
use lib $FindBin::Bin;

my $usage = q/
Split a multi-fasta file in multiple smaller parts.
Usage:
 fasplit {-n <seqs_per_part> | -s <partsize>} [-o <outname>] <multifasta.fa>

Options:
There are two modes of operation (mutually exclusive):
 -n split by number of sequences in a part
 -s split by part size; <partsize> is a numeric argument representing
    the desired part size in Megabytes

 -N write each sequence to its own part (same as -n 1)
 -o specify the name prefix for the output files (parts);a 0-padded number
    and a .fa extension will be appended to <outname> for each part
/;
umask 0002;

# Parse switches into a hash; this keeps the script clean under
# "use warnings" (no once-only package variables) and "use strict".
my %opt;
getopts('Nn:s:o:', \%opt) || die($usage."\n");

# Test for presence rather than truth so that a file literally named "0"
# is still accepted.
my $infile = $ARGV[0];
die("No multifasta input file provided!\n")
    unless defined($infile) && length($infile);

# Default output prefix: the input name with its last ".ext" removed.
my $outprefix = $opt{o};
my $perSeq    = $opt{N};
unless (defined($outprefix) && length($outprefix)) {
    $outprefix = $infile;
    $outprefix =~ s/\.\w+$//;
}

my $partsize = $opt{s};
my $numseqs  = $opt{n};

# Reject values that would otherwise be silently treated as 0.
die("$usage\nInvalid -n value '$numseqs': a whole number is expected!\n")
    if defined($numseqs) && $numseqs !~ /\A\d+\z/;
die("$usage\nInvalid -s value '$partsize': a non-negative number is expected!\n")
    if defined($partsize)
    && !(looks_like_number($partsize) && $partsize >= 0);
die("$usage\nOnly one of -n or -s options can be given!\n")
    if ($numseqs && $partsize);
if ($perSeq) { $numseqs = 1; $partsize = 0; }

# Pick the handler that decides, at each record start, whether the current
# part is finished. When no mode option was given, $partsize is 0 below and
# the size-mode handler therefore starts a new part at every header.
my $bufProcess = $numseqs ? \&nBufProc : \&sBufProc;

# 3-argument open: the input name is never parsed for mode/pipe characters.
open(my $in_fh, '<', $infile)
    || die("Input file $infile cannot be opened: $!\n");

# Convert the requested part size from megabytes to bytes.
$partsize = int(($partsize || 0) * 1024 * 1024);

# State shared with nBufProc/sBufProc (they read and update these
# file-scoped variables directly, as well as the PFNAME handle).
my $rsize   = 0;  # bytes written to the current part so far
my $wsize   = 0;  # size of the most recently closed part (bookkeeping only)
my $scount  = 1;  # sequences in the current part; starts at 1 because the
                  # handlers return before counting while $rsize is still 0
my $partnum = 1;  # current part number
my $pfname  = sprintf('%s%02d', $outprefix, $partnum).'.fa';
open(PFNAME, '>', $pfname) || die("Error creating file $pfname: $!\n");

while (my $line = <$in_fh>) {
    $line =~ tr/\n\r//d;
    # Skip empty lines only; a data line consisting of "0" must be kept.
    next unless length($line);
    if ($line =~ m/^>/) {
        # A new record starts: let the mode handler rotate parts if needed.
        $bufProcess->();
    }
    $rsize += length($line) + 1;
    print PFNAME $line."\n"
        or die("Error writing to file $pfname: $!\n");
}
close($in_fh);
# Buffered write errors (e.g. a full disk) only surface when closing.
close(PFNAME) || die("Error closing file $pfname: $!\n");
60
# Sequence-count mode handler; the main loop calls it each time a FASTA
# header line is about to be written.
#
# Takes no arguments and works on file-scoped state: reads $numseqs and
# $outprefix, updates $rsize, $wsize, $scount, $partnum and $pfname, and
# re-opens the PFNAME handle when a new part is started.
# Dies if the finished part cannot be closed or the next one created.
sub nBufProc {
    # Nothing written to the current part yet (e.g. the very first header):
    # there is nothing to rotate and the record is already accounted for.
    return unless $rsize > 0;
    if ($scount >= $numseqs) {
        # The current part holds enough sequences. Buffered write errors
        # only surface at close time, so check it.
        close(PFNAME) || die("Error closing file $pfname: $!\n");
        $partnum++;
        $pfname = sprintf('%s%02d', $outprefix, $partnum).'.fa';
        # 3-argument open: the part name is taken literally even when the
        # prefix starts with '>' or whitespace.
        open(PFNAME, '>', $pfname)
            || die("Error creating file $pfname: $!\n");
        $wsize  = $rsize;  # remember the size of the part just closed
        $rsize  = 0;
        $scount = 0;
    }
    # Count the record that is about to be written.
    $scount++;
    return;
}
75
# Part-size mode handler; the main loop calls it each time a FASTA header
# line is about to be written, so parts are only ever split on record
# boundaries.
#
# Takes no arguments and works on file-scoped state: reads $partsize (the
# main script stores it in bytes) and $outprefix, updates $rsize, $wsize,
# $scount, $partnum and $pfname, and re-opens the PFNAME handle when a new
# part is started.
# Dies if the finished part cannot be closed or the next one created.
sub sBufProc {
    # Nothing written to the current part yet (e.g. the very first header):
    # there is nothing to rotate.
    return unless $rsize > 0;
    if ($rsize >= $partsize) {
        # The current part reached the requested size: close it and open
        # the next one. Buffered write errors only surface at close time.
        close(PFNAME) || die("Error closing file $pfname: $!\n");
        $partnum++;
        $pfname = sprintf('%s%02d', $outprefix, $partnum).'.fa';
        # 3-argument open: the part name is taken literally even when the
        # prefix starts with '>' or whitespace.
        open(PFNAME, '>', $pfname)
            || die("Error creating file $pfname: $!\n");
        $wsize  = $rsize;  # remember the size of the part just closed
        $rsize  = 0;
        $scount = 0;
    }
    # Count the record that is about to be written.
    $scount++;
    return;
}

Properties

Name Value
svn:executable *