ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/nr_contain.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 4008 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 nr_contain.pl [-M] [-o <outfile.cl>] <chr_mapping_intervals>
8
9 Performs containment clustering of genomic intervals\/mappings
10 Input is expected to be sorted by chromosome, start, end
11
12 Expected input format:
13
14 chr[strand] start end mappingID mapping_defline
15
16 Options:
17
18 -M merge overlapping intervals into single segments
19
20 /;
21 # perl -ne \
22 # '@t=split(/\t/);($c,$a,$b,$s)=m/ ([^\:]+)\:(\d+)\-(\d+)([\-\+])/;print join("\t",$c.$s,$a,$b,$t[0],$t[2])."\n"' \
23 # hg19_primary_transcripts.fsize | sort -k1,1 -k2,2n -k3,3n > hg19_primary_transcripts.tab
24 # perl -pe 's/[\-\+]\t/\t/' hg19_primary_transcripts.tab |sort -k1,1 -k2,2n -k3,3n > hg19_pri_rna.nostrand.tab
25
26
27 #After clustering, get the sequence of the container intervals like this:
28
29 # grep '^>' containers.cl | perl -ne 'chomp;@t=split(/\s+/,$_,5);$t[0]=~s/^>//; $t[0]=~s/^[^\|]+\|//; \
30 # print "$t[1]\t$t[2]-$t[3]\t$t[0]|$t[1]:$t[2]-$t[3] $t[4]\n"' | seqmanip -f - hg19.fa > hg19_cseqs.fa
31
32 umask 0002;
33 getopts('Mo:') || die($usage."\n");
34 my $merge=$Getopt::Std::opt_M;
35 my $outfile=$Getopt::Std::opt_o;
36 if ($outfile) {
37 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
38 select(OUTF);
39 }
40 my %idata; #interval data: idata{id}=>[chr, start, end, annotation]
41 my $max_end; #max seen right end for current chromosome, still buffered
42 my $max_end_id; #id of maximum right end entry in buffered region
43 my $max_end_idx; #idx of maximum right end entry in @reg
44 my @reg; # data in buffered region starting < $max_end[2]
45 # array of [id, chr, start, end, included_ids...]
46 # 0 1 2 3 4...
47 my $chr;
48 while (<>) {
49 chomp;
50 my ($c, $cstart, $cend, $id, $idann)=split(/\t/);
51 if ($chr ne $c) {
52 flush_region();
53 %idata=();
54 $chr=$c;
55 }
56 else {
57 flush_region() if $cstart>=$max_end;
58 }
59 $idann=~s/\w+[\+\-]?\:\d+\-\d+[\+\-]?//;
60 $idata{$id}=[$c, $cstart, $cend, $idann];
61 unless ($max_end) {
62 push(@reg, [$id, $c, $cstart, $cend]);
63 ($max_end, $max_end_id, $max_end_idx)=($cend, $id, $#reg);
64 next;
65 }
66 if ($merge) {
67 #check for overlaps with other intervals in @reg
68 if ($cstart<$max_end-3) {
69 push(@{$reg[$max_end_idx]}, $id);
70 if ($cend>$max_end) {
71 $max_end=$cend;
72 $reg[$max_end_idx]->[3]=$max_end; #adjust ending of cluster region
73 }
74 next;
75 }
76 }
77 else {
78 # check for containment IN other intervals of @reg
79 if ($cend<=$max_end+3) {
80 #merge into max_id
81 push(@{$reg[$max_end_idx]}, $id);
82 if ($max_end<$cend) {
83 $max_end=$cend;
84 $reg[$max_end_idx]->[3]=$max_end;
85 }
86 next; #do not store as independent member of @reg
87 }
88 # check for containment OF other intervals of @reg, take over
89 if ($cstart<=$reg[$max_end_idx]->[1]+3 && $cend>$max_end) {
90 $max_end=$cend;
91 push(@{$reg[$max_end_idx]}, $reg[$max_end_idx]->[0]);
92 $reg[$max_end_idx]->[0]=$id;
93 $reg[$max_end_idx]->[3]=$max_end;
94 next;
95 }
96 }
97 # no overlap/containment, add to region buffer
98 push(@reg, [$id, $c, $cstart, $cend]);
99 if ($cend>$max_end) {
100 ($max_end, $max_end_id, $max_end_idx)=($cend, $id, $#reg);
101 }
102 }
103
104 flush_region();
105
106 # --
107 if ($outfile) {
108 select(STDOUT);
109 close(OUTF);
110 }
111
112 #************ Subroutines **************
113
114 sub flush_region {
115 return unless @reg>0;
116 foreach my $d (@reg) {
117 my @t=@$d;
118 my $clid=shift(@t);
119 next unless $clid; # could be a contained one, skip it
120 print ">$clid ".shift(@t).' '.shift(@t).' '.shift(@t);
121 my $ida=$idata{$clid} || die("Error retrieving data for container id $clid!\n");
122 print " ".$$ida[3] if $$ida[3];
123 print "\n";
124 if (@t>0) {
125 foreach my $id (@t) {
126 my $dv=$idata{$id} ||
127 die("Error retrieving data for id $id!\n");
128 print join(' ',$id, $$dv[1], $$dv[2], $$d[3])."\n";
129 }
130 }
131 }
132 # reset region data
133 ($max_end, $max_end_id, $max_end_idx)=(undef,undef,undef);
134 @reg=();
135 }

Properties

Name Value
svn:executable *