ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/uniprot_nrsort.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 2990 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 #this sorts the concatenated deflines after nrdb -d1
3 #such that "reviewed" deflines come first 'UPr|'
4 use strict;
5
#-------- file-level list of regular-expression sources that mark an
# annotation as uninformative. isInformative() tries every entry against
# the text with the /i modifier, so letter case in the patterns is
# irrelevant. Entries that are commented out are kept for reference only.
my @uninformative=(
    '\bunknown\b',
    #'unknown protein\b',
    '\bhypothetical\b',
    'unnamed protein product',
    'open reading frame',
    '\borf\b',
    '\bputative\b',
    '\bhomologue\b',
    '\bsimilar to',
    '^expressed sequence \S+$',
    '\bHA\d{4}\b',
    '\bDKFZP\S+\b',
    'PROTEIN FOR MGC:\d+',
    'PROTEIN FOR IMAGE:\d+',
    '\bR\d{5}\_\d\b',
    '\bPRO\d{4}\b',
    # 'KIAA\d+ GENE PRODUCT',
    #'KIAA\d+ PROTEIN',
    '\bKIAA\d+\b',
    '\bHSPC\d+\b',
    # HSPC\d+ PROTEIN
    #'\bC\d+ORF\d+\b',
    'FLJ\d+ PROTEIN',
    '\bDJ\d+[A-Z]\d+(\.\d+)*',
    'NOVEL PROTEIN',
    'CG\d+ PROTEIN',
    'CG\d+ GENE PRODUCT',
    '^\s*CG\d+\s*$',
    'CGI\-\d+ PROTEIN',
    'CGI\-\d+',
    'CDNA:? FLJ\d+ FIS, CLONE \w+',
    'BA\d+[A-Z]\d+[A-Z]?\.\d(\.\d)?',
    #'\bRIKEN CDNA .{10} GENE\b',
    '\bRIKEN.+?CDNA\b',
    'MRNA, COMPLETE CDS, CLONE:\d+(\+\d[A-Z])?\-\d+',
    'MRNA, COMPLETE CDS, CLONE:SMAP\d+\-\w+',
    'BRAIN CDNA, CLONE MNCB-\d+',
    '.{10}RIK PROTEIN',
    '^MY\d{3}\s*$',
    # Was 'MY\d{3} PROTEIN^': a start-of-string anchor after literal text
    # can never match, so the entry was dead. End-of-string is what the
    # neighbouring MY\d{3} entries use.
    'MY\d{3} PROTEIN$',
    '^probable\b',
    'BRAIN MY\d{3}$',
    'NPD\d{3} PROTEIN',
    '[A-Z]\d{2}[A-Z0-9]+\.\d+ PROTEIN',
    'WUGSC:H_\w+\.\w+ PROTEIN',
    #'DNA SEGMENT, CHR [0-9XY]+, WAYNE STATE UNIVERSITY \d+, EXPRESSED',
    #'DNA SEGMENT, CHR [0-9XY]+, KL MOHLKE \d+',
    #'DNA SEGMENT, CHR [0-9XY]+, BAYLOR \d+',
    '\bDNA SEGMENT\b',
    'PROTEIN HSPC\d+',
    #'HYPOTHETICAL [\.\d]+\s*KDA PROTEIN \S+ IN CHROMOSOME \S+',
    'EG:[0-9A-Z\.]+ PROTEIN',
    'GENOMIC DNA, CHROMOSOME \d+, P1 CLONE:\S+',
    '[^,]+, RIKEN FULL-LENGTH ENRICHED LIBRARY, CLONE:.{10}, FULL INSERT SEQUENCE',
    'ZK\d+\.\d+ PROTEIN',
    '\bEST \w+',
    'B2 ELEMENT'
);
66
# Read the input (files named on the command line, or STDIN) one FASTA
# record at a time: with this record separator every read ends at the
# "\n>" that opens the next record.
$/="\n>";
my $excluded=0;    # records skipped because the sequence has < 10 residues
my $written=0;     # records printed to STDOUT
while (<>) {
    chomp;         # drops the trailing "\n>" separator when present
    s/^>//;        # the very first record still carries its own '>'
    my ($defline, $seq)=(m/^([^\n]+)(.+)/s);
    # A failed match means there is nothing after the defline line; such a
    # record has no sequence and is counted as too short.
    unless (defined $seq) {
        $excluded++;
        next;
    }
    my $fseq=$seq;   # sequence text as read (leading "\n" and line breaks kept)
    # The last record of the input is not followed by "\n>", so chomp leaves
    # its final newline(s) in place; remove them here because the print
    # below appends exactly one "\n" to every record.
    $fseq=~s/\n+\z//;
    $seq=~tr/\n\r\t //d;   # bare residues, used only for the length test
    if (length($seq)<10) {
        $excluded++;
        next;
    }
    #print STDERR "defline: $defline\nseq: $seq\n";
    #next;

    ## temp fix for my silly mistake:
    # $defline=~s/UPr\|/UX\|/sg;
    # $defline=~s/UP\|/UPr\|/sg;
    # $defline=~s/UX\|/UP\|/sg;
    ##

    # Several deflines joined by \x01: reorder them with cmpDeflines, keep
    # the first one untouched and strip the " UniRef..." token from the rest.
    if ($defline=~m/\x01/) {
        my @d=split(/\x01/, $defline);
        @d=sort cmpDeflines @d;
        my $f=shift(@d);
        foreach (@d) { s/ UniRef\w+// };
        $defline=join("\x01",$f,@d);
    }
    print '>'.$defline.$fseq."\n";
    $written++;
}
my $wtotal=$excluded+$written;
print STDERR "Done. $wtotal total entries processed ($excluded excluded for being too short)\n";
101
#===============================================
# bool isInformative($description)
# Returns 0 when the text, after trimming surrounding whitespace, is
# shorter than 2 characters or matches (case-insensitively) any entry of
# @uninformative; returns 1 otherwise.
# Expects only the description - not the accession.
#===============================================
sub isInformative {
    my $text=$_[0];
    $text=~s/^\s+//g;
    $text=~s/\s+$//g;
    return 0 if length($text)<2;
    foreach my $pat (@uninformative) {
        return 0 if $text=~m/$pat/i;
    }
    return 1;
}
117
118
# Sort comparator for deflines (used as: sort cmpDeflines @list).
# Each defline gets a score: 2 if it starts with 'UPr|' or contains an
# NP_<digits> accession, plus the result of isInformative() (0 or 1).
# Returns the <=> comparison arranged so that higher scores sort first.
#
# NOTE(review): the header of isInformative says it expects the description
# without the accession, but the whole defline is passed here; patterns
# anchored with ^ presumably cannot match as a result - confirm intent.
sub cmpDeflines {
    # Read the package sort variables explicitly and keep them under other
    # names, so no lexical $a/$b shadows the ones sort() sets.
    my $left=$main::a;
    my $right=$main::b;
    # Initialise unconditionally: "my $v=2 if COND" is undefined behaviour
    # in Perl and in practice lets the variable keep its value from the
    # previous call whenever COND is false, corrupting the scores.
    my $va=($left=~m/^UPr\|/ || $left=~m/\bNP_\d+/) ? 2 : 0;
    $va+=isInformative($left);
    my $vb=($right=~m/^UPr\|/ || $right=~m/\bNP_\d+/) ? 2 : 0;
    $vb+=isInformative($right);
    return ($vb <=> $va);
}

Properties

Name Value
svn:executable *