ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/uniprot_nrsort.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (9 years, 1 month ago) by gpertea
File size: 2990 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     #this sorts the concatenated deflines after nrdb -d1
3     #such that "reviewed" deflines come first 'UPr|'
4     use strict;
5    
#-------- local array holding uninformative annotation patterns.
# Each entry is a regular-expression source string; isInformative() applies
# every one of them case-insensitively (m/$pat/i) to a description, and a
# single hit is enough to classify that description as uninformative.
# Entries that are commented out are disabled patterns kept for reference.
my @uninformative=(
    '\bunknown\b',
    #'unknown protein\b',
    '\bhypothetical\b',
    'unnamed protein product',
    'open reading frame',
    '\borf\b',
    '\bputative\b',
    '\bhomologue\b',
    '\bsimilar to',
    '^expressed sequence \S+$',
    '\bHA\d{4}\b',
    '\bDKFZP\S+\b',
    'PROTEIN FOR MGC:\d+',
    'PROTEIN FOR IMAGE:\d+',
    '\bR\d{5}\_\d\b',
    '\bPRO\d{4}\b',
    # 'KIAA\d+ GENE PRODUCT',
    #'KIAA\d+ PROTEIN',
    '\bKIAA\d+\b',
    '\bHSPC\d+\b',
    # HSPC\d+ PROTEIN
    #'\bC\d+ORF\d+\b',
    'FLJ\d+ PROTEIN',
    '\bDJ\d+[A-Z]\d+(\.\d+)*',
    'NOVEL PROTEIN',
    'CG\d+ PROTEIN',
    'CG\d+ GENE PRODUCT',
    '^\s*CG\d+\s*$',
    'CGI\-\d+ PROTEIN',
    'CGI\-\d+',
    'CDNA:? FLJ\d+ FIS, CLONE \w+',
    'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?',
    #'\bRIKEN CDNA .{10} GENE\b',
    '\bRIKEN.+?CDNA\b',
    'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+',
    'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+',
    'BRAIN CDNA, CLONE MNCB-\d+',
    '.{10}RIK PROTEIN',
    '^MY\d{3}\s*$',
    # This entry used to end in '^', which can never match after consumed
    # text, so the pattern was dead; it is anchored at the end with '$' now.
    'MY\d{3} PROTEIN$',
    '^probable\b',
    'BRAIN MY\d{3}$',
    'NPD\d{3} PROTEIN',
    '[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN',
    'WUGSC:H_\w+\.\w+ PROTEIN',
    #'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED',
    #'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+',
    #'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+',
    '\bDNA SEGMENT\b',
    'PROTEIN HSPC\d+',
    #'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+',
    'EG:[0-9A-Z\.]+ PROTEIN',
    'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+',
    '[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE',
    'ZK\d+\.\d+ PROTEIN',
    '\bEST \w+',
    'B2 ELEMENT'
);
66    
# Main loop: read FASTA records from the files named on the command line (or
# STDIN), reorder the \x01-concatenated deflines of each record with
# cmpDeflines, and print the record to STDOUT. Records whose sequence has
# fewer than 10 residues (after removing whitespace), or that have no
# defline/sequence at all, are dropped and counted in $excluded.

# A record ends where the next one starts: at a newline followed by '>'.
$/="\n>";
my $excluded=0;
my $written=0;
while (my $record = <>) {
    chomp($record);       # drops the trailing "\n>" separator, when present
    $record =~ s/^>//;    # only the very first record still carries its '>'

    # Split at the FIRST newline: defline before it, sequence lines after it.
    # Requiring that newline keeps a defline-only record from being cut in
    # the middle of its defline; such records simply do not match.
    my ($defline, $fseq) = ($record =~ m/\A([^\n]+)\n(.*)\z/s);
    unless (defined $defline) {
        $excluded++;
        next;
    }

    # Measure the sequence on a whitespace-free copy so that the printed
    # record keeps its original line layout.
    (my $seq = $fseq) =~ tr/\n\r\t //d;
    if (length($seq)<10) {
        $excluded++;
        next;
    }

    # The last record of the input is not followed by "\n>", so chomp leaves
    # its final newline in place; strip it here so that the "\n" appended by
    # print below does not produce a blank line.
    $fseq =~ s/\n+\z//;

    ## temp fix for my silly mistake:
    # $defline=~s/UPr\|/UX\|/sg;
    # $defline=~s/UP\|/UPr\|/sg;
    # $defline=~s/UX\|/UP\|/sg;
    ##

    if ($defline=~m/\x01/) {
        # Several deflines were concatenated with \x01: put the best one
        # (according to cmpDeflines) first.
        my @deflines = sort cmpDeflines split(/\x01/, $defline);
        my $first = shift(@deflines);
        # Only the leading defline keeps its " UniRef..." token.
        foreach my $other (@deflines) { $other =~ s/ UniRef\w+//; }
        $defline = join("\x01", $first, @deflines);
    }
    print '>'.$defline."\n".$fseq."\n";
    $written++;
}
my $wtotal=$excluded+$written;
print STDERR "Done. $wtotal total entries processed ($excluded excluded for being too short)\n";
101    
#===============================================
# bool isInformative($description)
# Returns 1 when the given text looks like a meaningful annotation and 0
# otherwise. The text is trimmed first; it is rejected when fewer than two
# characters remain or when any pattern in @uninformative matches it
# (case-insensitively).
# Expects only the description - not the accession.
#===============================================
sub isInformative {
    my $text = $_[0];
    $text =~ s/^\s+//g;
    $text =~ s/\s+$//g;
    return 0 if length($text)<2;
    for my $pat (@uninformative) {
        return 0 if $text =~ m/$pat/i;
    }
    return 1;
}
117    
118    
#===============================================
# int cmpDeflines()
# Comparator for sort(): reads the two deflines being compared from
# $main::a and $main::b and orders them by descending rank, where
#   rank = 2 if the defline starts with 'UPr|' or contains an NP_<digits>
#            accession, 0 otherwise
#        + 1 if isInformative() accepts the defline.
# Returns the result of <=> on the two ranks (higher rank sorts first).
# NOTE(review): isInformative() documents that it expects the description
# without the accession, but the whole defline is passed here - confirm
# whether the accession should be stripped first.
#===============================================
sub cmpDeflines {
    my $left  = $main::a;
    my $right = $main::b;
    # Always initialise the ranks. The previous "my $v=2 if (...)" form is
    # undefined behaviour in Perl: when the condition is false the lexical
    # may keep the value it had in an earlier call, corrupting the ranking.
    my $rank_left  = ($left =~m/^UPr\|/ || $left =~m/\bNP_\d+/) ? 2 : 0;
    my $rank_right = ($right=~m/^UPr\|/ || $right=~m/\bNP_\d+/) ? 2 : 0;
    $rank_left  += isInformative($left);
    $rank_right += isInformative($right);
    return ($rank_right <=> $rank_left);
}

Properties

Name Value
svn:executable *