ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/src/owl/core/sequence/UniprotHomologList.java
Revision: 1157
Committed: Wed Jul 7 15:31:14 2010 UTC (14 years, 2 months ago) by jmduarteg
File size: 32228 byte(s)
Log Message:
Fixed bug, wasn't computing the nucleotide alignment correctly (was missing the last codon)
Line File contents
1 package owl.core.sequence;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.io.PrintStream;
9 import java.io.PrintWriter;
10 import java.util.ArrayList;
11 import java.util.Collection;
12 import java.util.HashMap;
13 import java.util.Iterator;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.TreeSet;
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20
21 import org.apache.log4j.Logger;
22 import org.xml.sax.SAXException;
23
24 import owl.core.connections.EmblWSDBfetchConnection;
25 import owl.core.connections.NoMatchFoundException;
26 import owl.core.connections.UniProtConnection;
27 import owl.core.runners.SelectonRunner;
28 import owl.core.runners.TcoffeeError;
29 import owl.core.runners.TcoffeeRunner;
30 import owl.core.runners.blast.BlastError;
31 import owl.core.runners.blast.BlastHit;
32 import owl.core.runners.blast.BlastHitList;
33 import owl.core.runners.blast.BlastRunner;
34 import owl.core.runners.blast.BlastXMLParser;
35 import owl.core.sequence.alignment.AlignmentConstructionError;
36 import owl.core.sequence.alignment.MultipleSequenceAlignment;
37 import owl.core.sequence.alignment.PairwiseSequenceAlignment;
38 import owl.core.sequence.alignment.PairwiseSequenceAlignment.PairwiseSequenceAlignmentException;
39 import owl.core.util.FileFormatError;
40 import owl.core.util.Goodies;
41 import uk.ac.ebi.kraken.interfaces.uniprot.DatabaseType;
42 import uk.ac.ebi.kraken.interfaces.uniprot.NcbiTaxon;
43 import uk.ac.ebi.kraken.interfaces.uniprot.NcbiTaxonomyId;
44 import uk.ac.ebi.kraken.interfaces.uniprot.Organelle;
45 import uk.ac.ebi.kraken.interfaces.uniprot.UniProtEntry;
46 import uk.ac.ebi.kraken.interfaces.uniprot.dbx.embl.Embl;
47 import uk.ac.ebi.kraken.uuw.services.remoting.EntryIterator;
48
/**
 * Class to store a set of uniprot homologs of a given sequence.
 * It contains methods to blast against a uniprot database to get the homolog
 * list and to retrieve data from uniprot (taxonomy, dbrefs) and embl cds sequences.
 *
 * @see UniprotHomolog
 *
 * @author duarte_j
 *
 */
public class UniprotHomologList implements Iterable<UniprotHomolog>{

	/*------------------------ constants --------------------------*/

	// suffix/basename pieces used to build the temp file names for the blast run
	private static final String BLASTOUT_SUFFIX = "blast.out.xml";
	private static final String FASTA_SUFFIX = ".fa";
	private static final String BLAST_BASENAME = "homSearch";

	private static final int BLAST_OUTPUT_TYPE = 7; // xml output
	private static final boolean BLAST_NO_FILTERING = true;
	// file found in the blast db dir from which the uniprot release version is parsed
	private static final String UNIPROT_VER_FILE = "reldate.txt";

	private static final String TCOFFEE_ALN_OUTFORMAT = "fasta";

	// when true, temp files (blast/t_coffee inputs and outputs) are kept for inspection
	private static final boolean DEBUG = false;

	private static final Logger LOGGER = Logger.getLogger(UniprotHomologList.class);

	/*-------------------------- members --------------------------*/

	private UniprotEntry ref;                        // the uniprot entry to which the homologs refer
	private List<UniprotHomolog> list;               // the list of homologs
	private Map<String,List<UniprotHomolog>> lookup; // to speed up searches (uniprot ids to Homologs lists)
	                                                 // it's a list because blast can hit a single uniprot in multiple regions (for us
	                                                 // that's multiple BlastHits)
	private double idCutoff;                         // the identity cutoff (see restrictToMinIdAndCoverage() )
	private String uniprotVer;                       // the version of uniprot used in blasting, read from the reldate.txt uniprot file

	private MultipleSequenceAlignment aln;           // the protein sequences alignment
	private MultipleSequenceAlignment nucAln;        // the nucleotides alignment

	private int reducedAlphabet;                     // the reduced alphabet used to calculate entropies
	private List<Double> entropies;                  // entropies for each uniprot reference sequence position
	private List<Double> kaksRatios;                 // ka/ks for each uniprot reference sequence position


	/**
	 * Creates an empty homolog list referring to the given query uniprot entry.
	 * The identity cutoff starts at 0.0, i.e. no filtering is applied until
	 * {@link #restrictToMinIdAndCoverage(double, double)} is called.
	 * @param ref the uniprot entry to which the homologs will refer
	 */
	public UniprotHomologList(UniprotEntry ref) {
		this.ref = ref;
		this.idCutoff = 0.0; // i.e. no filter
	}
99
	/**
	 * Performs a blast search based on the reference UniprotEntry to populate this list of homologs.
	 * All blast output files will be removed on exit (unless DEBUG is set).
	 * If a cache file is given and exists, the blast XML output is read from it instead of running
	 * blast; in that case the cached query id and uniprot version are checked against the current ones.
	 * Hits whose subject id does not contain a parseable uniprot accession are skipped with a
	 * warning on stderr. The lookup map is initialised at the end.
	 * @param blastBinDir directory containing the blast executables
	 * @param blastDbDir directory containing the blast database and the uniprot reldate.txt file
	 * @param blastDb name of the blast database to search against
	 * @param blastNumThreads number of CPU threads for blast to use
	 * @param cacheFile a file with the cached xml blast output file, if null blast will be always run
	 * @throws IOException if files can't be read/written, if the cache XML is malformed or if the
	 * cached query id or uniprot version don't match the current ones
	 * @throws BlastError if the blast run itself fails
	 */
	public void searchWithBlast(String blastBinDir, String blastDbDir, String blastDb, int blastNumThreads, File cacheFile) throws IOException, BlastError {
		File outBlast = null;
		boolean fromCache = false;
		if (cacheFile!=null && cacheFile.exists()) {
			outBlast = cacheFile;
			fromCache = true;
			LOGGER.warn("Reading blast results from cache file "+cacheFile);
		} else {
			outBlast = File.createTempFile(BLAST_BASENAME,BLASTOUT_SUFFIX);
			File inputSeqFile = File.createTempFile(BLAST_BASENAME,FASTA_SUFFIX);
			if (!DEBUG) {
				outBlast.deleteOnExit();
				inputSeqFile.deleteOnExit();
			}
			// NOTE: we blast the reference uniprot sequence
			this.ref.getUniprotSeq().writeToFastaFile(inputSeqFile);
			BlastRunner blastRunner = new BlastRunner(blastBinDir, blastDbDir);
			blastRunner.runBlastp(inputSeqFile, blastDb, outBlast, BLAST_OUTPUT_TYPE, BLAST_NO_FILTERING, blastNumThreads);
			this.uniprotVer = readUniprotVer(blastDbDir);
			if (cacheFile!=null) {
				try {
					Goodies.copyFile(outBlast, cacheFile);
				} catch (IOException e) {
					// best-effort: a failure to write the cache must not abort the search
					System.err.println("Couldn't write the blast cache file "+cacheFile);
					System.err.println(e.getMessage());
				}
			}
		}

		BlastHitList blastList = null;
		try {
			BlastXMLParser blastParser = new BlastXMLParser(outBlast);
			blastList = blastParser.getHits();
		} catch (SAXException e) {
			if (fromCache) {
				throw new IOException("Cache file "+cacheFile+" does not comply with blast XML format.");
			} else {
				// if this happens it means that blast doesn't format correctly its XML, i.e. has a bug
				System.err.println("Unexpected error: "+e.getMessage());
				System.exit(1);
			}
		}

		if (fromCache) {
			// sanity checks: the cached result must belong to the same query and uniprot release
			if (!blastList.getQueryId().equals(this.ref.getUniprotSeq().getName())) {
				throw new IOException("Query id "+blastList.getQueryId()+" from cache file "+cacheFile+" does not match the id from the sequence: "+this.ref.getUniprotSeq().getName());
			}
			this.uniprotVer = readUniprotVer(cacheFile.getParent());
			String uniprotVerFromBlastDbDir = readUniprotVer(blastDbDir);
			if (!uniprotVerFromBlastDbDir.equals(uniprotVer)) {
				throw new IOException("Uniprot version from blast db dir "+blastDbDir+" does not match version in cache dir "+cacheFile.getParent());
			}
		}


		// convert blast hits into UniprotHomolog objects, parsing the uniprot accession out of the subject id
		this.list = new ArrayList<UniprotHomolog>();
		for (BlastHit hit:blastList) {
			String sid = hit.getSubjectId();
			Matcher m = Sequence.DEFLINE_PRIM_ACCESSION_REGEX.matcher(sid);
			if (m.matches()) {
				String uniId = m.group(1);
				list.add(new UniprotHomolog(hit,new UniprotEntry(uniId)));
			} else {
				System.err.println("Could not find uniprot id in subject id "+sid);
			}
		}
		initialiseMap();
	}
179
	//TODO write a searchWithPSIBlast method
181
182 /**
183 * Initialises the lookup map (for speeding up lookups of homologs by uniprot ids)
184 */
185 private void initialiseMap() {
186 this.lookup = new HashMap<String, List<UniprotHomolog>>();
187 for (UniprotHomolog hom:this) {
188 if (lookup.containsKey(hom.getUniId())) {
189 lookup.get(hom.getUniId()).add(hom);
190 } else {
191 List<UniprotHomolog> list = new ArrayList<UniprotHomolog>();
192 list.add(hom);
193 lookup.put(hom.getUniId(), list);
194 }
195
196 }
197 }
198
199 private String readUniprotVer(String blastDbDir) {
200 String ver = "unknown";
201 File uniprotVerFile = new File(blastDbDir,UNIPROT_VER_FILE);
202 try {
203
204 BufferedReader br = new BufferedReader(new FileReader(uniprotVerFile));
205 String line;
206 Pattern p = Pattern.compile("^UniProt\\sKnowledgebase\\sRelease\\s([\\d._]+)\\s.*");
207 while ((line=br.readLine())!=null){
208 Matcher m = p.matcher(line);
209 if (m.matches()) {
210 ver = m.group(1);
211 break;
212 }
213 }
214 br.close();
215 } catch(IOException e) {
216 System.err.println("Warning: couldn't read uniprot version from file "+uniprotVerFile);
217 }
218 return ver;
219 }
220
	/**
	 * Retrieves from UniprotKB the sequence, taxonomy and EMBL CDS ids data,
	 * by using the remote Uniprot API.
	 * For each homolog it fills in: the uniprot sequence, the ncbi taxonomy id and taxon
	 * names, the EMBL CDS identifiers (deduplicated, version suffix stripped) and the
	 * gene encoding organelle (if any).
	 */
	public void retrieveUniprotKBData() {
		UniProtConnection uniprotConn = new UniProtConnection();
		if (!uniprotConn.getVersion().equals(this.uniprotVer)){
			System.err.println("Warning! Uniprot version used for blast ("+uniprotVer+") and uniprot version being queried with api ("+uniprotConn.getVersion()+")don't match!");
		}
		List<String> uniprotIds = new ArrayList<String>();
		for (UniprotHomolog hom:this) {
			uniprotIds.add(hom.getUniId());
		}
		EntryIterator<UniProtEntry> entries = uniprotConn.getMultipleEntries(uniprotIds);

		for (UniProtEntry entry:entries) {
			// a single uniprot id can correspond to several homologs (multiple blast hit regions): update all of them
			List<UniprotHomolog> homs = this.getHomolog(entry.getPrimaryUniProtAccession().getValue());
			for (UniprotHomolog hom:homs) {
				hom.getUniprotEntry().setUniprotSeq(new Sequence(hom.getUniId(),entry.getSequence().getValue()));

				List<NcbiTaxonomyId> ncbiTaxIds = entry.getNcbiTaxonomyIds();
				if (ncbiTaxIds.size()>1) {
					System.err.println("Warning! more than one taxonomy id for uniprot entry "+hom.getUniId());
				}
				// NOTE(review): assumes every entry carries at least one taxonomy id — get(0) would throw otherwise; confirm
				hom.getUniprotEntry().setTaxId(ncbiTaxIds.get(0).getValue());
				List<String> taxons = new ArrayList<String>();
				for(NcbiTaxon ncbiTaxon:entry.getTaxonomy()) {
					taxons.add(ncbiTaxon.getValue());
				}
				hom.getUniprotEntry().setTaxons(taxons);

				Collection<Embl> emblrefs = entry.getDatabaseCrossReferences(DatabaseType.EMBL);
				List<String> emblCdsIds = new ArrayList<String>();
				Set<String> tmpEmblCdsIdsSet = new TreeSet<String>();
				for(Embl ref:emblrefs) {
					String emblCdsIdWithVer = ref.getEmblProteinId().getValue();
					if (!emblCdsIdWithVer.equals("-")) { // for non annotated genomic dna cds sequences the identifier is '-', we ignore them
						// strip the trailing version suffix (e.g. ".1") to get the bare CDS id
						String emblCdsId = emblCdsIdWithVer.substring(0, emblCdsIdWithVer.lastIndexOf("."));
						//emblCdsIds.add(emblCdsId);
						tmpEmblCdsIdsSet.add(emblCdsId);
					}
				}
				emblCdsIds.addAll(tmpEmblCdsIdsSet); // we use the set to be sure there are no duplicates (it does happen sometimes)
				hom.getUniprotEntry().setEmblCdsIds(emblCdsIds);

				List<Organelle> orglls = entry.getOrganelles();
				if (orglls.size()>0) {
					// keep the first organelle; warn if the entry lists differing organelle types
					hom.getUniprotEntry().setGeneEncodingOrganelle(orglls.get(0).getType().getValue());
					if (orglls.size()>1) {
						for (Organelle orgll:orglls){
							if (!orgll.getType().equals(hom.getUniprotEntry().getGeneEncodingOrganelle())) {
								System.err.println("Warning! Different gene encoding organelles for Uniprot "+hom.getUniId());
							}
						}
					}
				}
			}
		}
	}
280
281 /**
282 * Retrieves from EMBL DB fetch web service the EMBL CDS sequences
283 * @param cacheFile a FASTA file containing the sequences to retrieve. If present and if
284 * it contains ALL required sequences then they are read from cacheFile. If null or file
285 * does not exist or file older than {@value #EmblWSDBfetchConnect.MAX_CACHE_AGE} then the sequences are
286 * retrieved from EMBL DB fetch
287 * @throws IOException
288 */
289 public void retrieveEmblCdsSeqs(File cacheFile) throws IOException {
290 List<String> allIds = new ArrayList<String>();
291
292 for (UniprotHomolog hom:this) {
293 allIds.addAll(hom.getUniprotEntry().getEmblCdsIds());
294 }
295 List<Sequence> allSeqs = null;
296 try {
297 allSeqs = EmblWSDBfetchConnection.fetchEMBLCDS(allIds, cacheFile);
298 } catch (NoMatchFoundException e) {
299 // this is unlikely to happen here, that's why we don't write a better error message
300 LOGGER.warn("Couldn't retrieve EMBL CDS sequences for some EMBL cds ids");
301 LOGGER.warn(e.getMessage());
302 }
303
304 // we put the list (containing all the sequences from all the homologs) in a lookup table
305 // so that we can then retrieve the ones corresponding to each homolog below
306 Map<String,Sequence> lookup = new HashMap<String,Sequence>();
307 for (Sequence seq:allSeqs) {
308 lookup.put(seq.getSecondaryAccession(), seq);
309 }
310
311 for (UniprotHomolog hom:this) {
312 List<Sequence> seqs = new ArrayList<Sequence>();
313 for (String emblCdsId:hom.getUniprotEntry().getEmblCdsIds()) {
314 if (lookup.containsKey(emblCdsId)) {
315 seqs.add(lookup.get(emblCdsId));
316 } else {
317 // this will happen when the CDS sequence was not returned by embl dbfetch or the cache file does not have it (it's in the list of missing entries)
318 // in either case we don't want the list of emblcs sequences to contain a null
319 LOGGER.warn("Sequence for EMBL CDS "+emblCdsId+" of uniprot entry "+hom.getUniId()+" could not be found. Not using it.");
320 //TODO should we also remove from the list of embl cds ids this emblCdsId? Not sure if it can cause problems
321 // to have an embl cds id without its corresponding sequence
322 }
323 }
324
325 hom.getUniprotEntry().setEmblCdsSeqs(seqs);
326 }
327 }
328
329 /**
330 * Gets the Homolog given a uniprot ID
331 * @param uniprotId
332 * @return
333 */
334 public List<UniprotHomolog> getHomolog(String uniprotId) {
335 return this.lookup.get(uniprotId);
336 }
337
338 public Iterator<UniprotHomolog> iterator() {
339 return this.list.iterator();
340 }
341
342 /**
343 * Write to the given file the query protein sequence and all the homolog protein
344 * sequences (full uniprot sequences) in fasta format.
345 * @param outFile
346 * @param writeQuery if true the query sequence is written as well, if false only homologs
347 * @throws FileNotFoundException
348 */
349 public void writeToFasta(File outFile, boolean writeQuery) throws FileNotFoundException {
350 PrintWriter pw = new PrintWriter(outFile);
351
352 int len = 80;
353
354 if (writeQuery) {
355 pw.println(MultipleSequenceAlignment.FASTAHEADER_CHAR + this.ref.getUniprotSeq().getName());
356 for(int i=0; i<this.ref.getLength(); i+=len) {
357 pw.println(ref.getUniprotSeq().getSeq().substring(i, Math.min(i+len,ref.getLength())));
358 }
359 }
360
361 for(UniprotHomolog hom:this) {
362
363 String sequence = hom.getUniprotSeq().getSeq();
364 pw.println(MultipleSequenceAlignment.FASTAHEADER_CHAR + hom.getLongSequenceTag());
365 for(int i=0; i<sequence.length(); i+=len) {
366 pw.println(sequence.substring(i, Math.min(i+len,sequence.length())));
367 }
368 }
369 pw.println();
370 pw.close();
371 }
372
373 /**
374 * Runs t_coffee to align all protein sequences of homologs and the query sequence
375 * returning a MultipleSequenceAlignment object
376 * @param tcoffeeBin
377 * @param veryFast whether to use t_coffee's very fast alignment (and less accurate) mode
378 * @throws IOException
379 * @throws TcoffeeError
380 */
381 public void computeTcoffeeAlignment(File tcoffeeBin, boolean veryFast) throws IOException, TcoffeeError {
382 File homologSeqsFile = File.createTempFile("homologs.", ".fa");
383 File outTreeFile = File.createTempFile("homologs.", ".dnd");
384 File alnFile = File.createTempFile("homologs.",".aln");
385 File tcoffeeLogFile = File.createTempFile("homologs.",".tcoffee.log");
386 if (!DEBUG) {
387 homologSeqsFile.deleteOnExit();
388 alnFile.deleteOnExit();
389 tcoffeeLogFile.deleteOnExit();
390 outTreeFile.deleteOnExit();
391 }
392 this.writeToFasta(homologSeqsFile, true);
393 TcoffeeRunner tcr = new TcoffeeRunner(tcoffeeBin);
394 tcr.runTcoffee(homologSeqsFile, alnFile, TCOFFEE_ALN_OUTFORMAT, outTreeFile, null, tcoffeeLogFile, veryFast);
395
396 try {
397 aln = new MultipleSequenceAlignment(alnFile.getAbsolutePath(), MultipleSequenceAlignment.FASTAFORMAT);
398 } catch (FileFormatError e) {
399 System.err.println("Unexpected error, output file of tcoffee "+alnFile+" does not seem to be in the right format.");
400 System.err.println("Error: "+e.getMessage());
401 System.exit(1);
402 } catch (AlignmentConstructionError e) {
403 System.err.println("Unexpected error, output file of tcoffee "+alnFile+" seems to contain .");
404 System.err.println("Error: "+e.getMessage());
405 System.exit(1);
406 }
407
408 }
409
410 /**
411 * Returns the protein sequence alignment of query sequence and all homologs
412 * @return
413 */
414 public MultipleSequenceAlignment getAlignment() {
415 return aln;
416 }
417
418 /**
419 * Returns a multiple sequence alignment of all valid nucleotide CDS sequences from the
420 * UniprotHomologList plus the query's CDS by mapping the nucleotide sequences to the protein
421 * sequences alignment.
422 * @return
423 */
424 public MultipleSequenceAlignment getNucleotideAlignment() {
425 if (nucAln!=null) {
426 return nucAln;
427 }
428 List<Sequence> allSeqs = new ArrayList<Sequence>();
429
430 // CDS of the query sequence
431 StringBuffer nucSeqSB = new StringBuffer();
432 String queryProtSeq = aln.getAlignedSequence(ref.getUniprotSeq().getName());
433 ProteinToCDSMatch queryMatching = ref.getRepresentativeCDS();
434 if (queryMatching!=null) {
435 String bestNucSeq = queryMatching.getNucleotideSeqForBestTranslation();
436 int j = 0;
437 for (int i=0;i<queryProtSeq.length();i++) {
438 char aa = queryProtSeq.charAt(i);
439 if (aa==MultipleSequenceAlignment.GAPCHARACTER) {
440 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
441 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
442 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
443 } else {
444 if (j+3<=bestNucSeq.length()) {
445 nucSeqSB.append(bestNucSeq.substring(j, j+3));
446 } else {
447 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
448 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
449 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
450 }
451 j+=3;
452 }
453 }
454 allSeqs.add(new Sequence(queryMatching.getCDSName(),nucSeqSB.toString()));
455 }
456
457 for (UniprotHomolog hom:this) {
458 nucSeqSB = new StringBuffer();
459 String protSeq = aln.getAlignedSequence(hom.getBlastHit().getSubjectId());
460 ProteinToCDSMatch matching = hom.getUniprotEntry().getRepresentativeCDS();
461 if (matching!=null) {
462 String bestNucSeq = matching.getNucleotideSeqForBestTranslation();
463 int j = 0;
464 for (int i=0;i<protSeq.length();i++) {
465 char aa = protSeq.charAt(i);
466 if (aa==MultipleSequenceAlignment.GAPCHARACTER) {
467 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
468 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
469 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
470 } else {
471 if (j+3<bestNucSeq.length()) {
472 nucSeqSB.append(bestNucSeq.substring(j, j+3));
473 } else {
474 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
475 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
476 nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER);
477 }
478 j+=3;
479
480 }
481 }
482 allSeqs.add(new Sequence(matching.getCDSName(),nucSeqSB.toString()));
483 }
484 }
485 try {
486 nucAln = new MultipleSequenceAlignment(allSeqs);
487 } catch(AlignmentConstructionError e) {
488 System.err.println("Unexpected error while creating the nucleotides alignment");
489 System.err.println(e.getMessage());
490 System.exit(1);
491 }
492
493 return nucAln;
494 }
495
496 public void writeAlignmentToFile(File alnFile) throws FileNotFoundException {
497 aln.writeFasta(new PrintStream(alnFile), 80, true);
498 }
499
500 public void writeNucleotideAlignmentToFile(File alnFile) throws FileNotFoundException {
501 this.getNucleotideAlignment().writeFasta(new PrintStream(alnFile), 80, true);
502 }
503
	/**
	 * Removes homologs below the given sequence identity and query coverage values.
	 * NOTE(review): the comparisons use '&lt;=', so homologs exactly AT either cutoff are
	 * removed as well — confirm this inclusive behaviour is intended (the javadoc said "below").
	 * The lookup map is reinitialised after the removal.
	 * @param idCutoff identity cutoff as a fraction (0 to 1), compared against percent identity/100
	 * @param queryCovCutoff minimum query coverage, same scale as BlastHit.getQueryCoverage()
	 */
	public void restrictToMinIdAndCoverage(double idCutoff, double queryCovCutoff) {
		this.idCutoff = idCutoff;

		Iterator<UniprotHomolog> it = list.iterator();
		while (it.hasNext()) {
			UniprotHomolog hom = it.next();
			if ((hom.getBlastHit().getPercentIdentity()/100.0)<=idCutoff || hom.getBlastHit().getQueryCoverage()<=queryCovCutoff) {
				it.remove();

			}
		}
		// finally we update the lookup table
		initialiseMap();
	}
522
	/**
	 * Removes the redundant sequences in this list of Homologs. The redundancy reduction procedes as follows:
	 * 1) It groups the sequences by taxonomy id and sequence identity
	 * 2) If any of the groups have more than one member then the pairwise identities within the group are calculated
	 * (all vs all Needleman-Wunsch). From the pairwise identity matrix sequences that are not 100% identity to all the others
	 * are removed from the group, leaving groups that contain only identical sequences from same species.
	 * 3) If after this second pruning any group has more than 1 member then a single member is chosen (first one with a
	 * good matching corresponding CDS sequence or simply first one if no good CDS matchings exist)
	 * The lookup map is reinitialised at the end.
	 */
	public void removeRedundancy() {

		// 1) grouping by tax id and sequence identity (identity rounded to one decimal in the key)
		Map<String,List<UniprotHomolog>> groups = new HashMap<String,List<UniprotHomolog>>();
		for (UniprotHomolog hom:this){
			double percentId = hom.getPercentIdentity();
			String taxId = hom.getUniprotEntry().getTaxId();
			String key = taxId+"_"+String.format("%5.1f",percentId);
			if (groups.containsKey(key)) {
				groups.get(key).add(hom);
			} else {
				List<UniprotHomolog> list = new ArrayList<UniprotHomolog>();
				list.add(hom);
				groups.put(key, list);
			}
		}
		LOGGER.debug("Number of protein sequence groups for redundancy elimination (based on same identity value and same tax id): "+groups.size());
		// 2) finding if group members are really identical (all vs all pairwise alignments)
		for (String key:groups.keySet()) {
			// all vs all pairwise alignments
			List<UniprotHomolog> list = groups.get(key);
			LOGGER.debug("Size of group "+key+": "+list.size());
			if (list.size()>1) {
				// upper triangle of the pairwise identity matrix (diagonal stays 0)
				double[][] pairwiseIdMatrix = new double[list.size()][list.size()];
				for (int i=0;i<list.size();i++){
					for (int j=i+1;j<list.size();j++){
						try {
							PairwiseSequenceAlignment aln = new PairwiseSequenceAlignment(list.get(i).getUniprotSeq(), list.get(j).getUniprotSeq());
							pairwiseIdMatrix[i][j]=aln.getPercentIdentity();
						} catch (PairwiseSequenceAlignmentException e) {
							LOGGER.fatal("Unexpected error. Couldn't align sequences for redundancy removal procedure");
							LOGGER.fatal(e.getMessage());
							System.exit(1);
						}
					}
				}
				// mirroring the other side of the matrix
				for (int i=0;i<list.size();i++){
					for (int j=i+1;j<list.size();j++){
						pairwiseIdMatrix[j][i] = pairwiseIdMatrix[i][j];
					}
				}

				String matStr = "";
				// if a member is not identical to all the others then we throw it away.
				// ">99.99f" treats anything at (or within rounding of) 100% as identical.
				// NOTE(review): the outer loop stops at size-2, so the LAST member of each
				// group is never flagged for removal here — confirm this is intended.
				boolean[] hasNoIdenticalPair = new boolean[list.size()];
				for (int i=0;i<list.size()-1;i++){
					boolean hasIdenticalPair = false;
					for (int j=0;j<list.size();j++){
						matStr+=String.format("%5.1f ", pairwiseIdMatrix[i][j]);
						if (pairwiseIdMatrix[i][j]>99.99f) {
							hasIdenticalPair = true;
						}
					}
					if (!hasIdenticalPair) hasNoIdenticalPair[i] = true;
					matStr+="\n";
				}

				LOGGER.debug("Pairwise similarities: \n"+matStr);
				// remove via the iterator while walking the group in matrix order
				Iterator<UniprotHomolog> it = list.iterator();
				int i = 0;
				while (it.hasNext()){
					it.next();
					if (hasNoIdenticalPair[i]) {
						it.remove();
					}
					i++;
				}
				LOGGER.debug("Size of group after elimination of sequences not identical to any of the others: "+list.size());
			}
		}
		// 3) if there are still groups with size>1 then they are really redundant, all sequences except one have to be eliminated
		// in any case we first check that the one we want to eliminate doesn't have a better CDS representative
		List<UniprotHomolog> toRemove = new ArrayList<UniprotHomolog>();
		for (String key:groups.keySet()) {
			List<UniprotHomolog> list = groups.get(key);
			if (list.size()>1) {
				UniprotHomolog homToKeep = null;
				for (UniprotHomolog hom:list){
					if (hom.getUniprotEntry().getRepresentativeCDS()!=null) {
						homToKeep = hom;
						break;
					}
				}
				if (homToKeep==null) { // if there wasn't any good CDS homolog we remove all but first
					for (int i=1;i<list.size();i++) {
						toRemove.add(list.get(i));
					}
				} else { // if we found one good CDS homolog we remove all the others
					for (UniprotHomolog hom:list) {
						if (hom!=homToKeep) toRemove.add(hom);
					}
				}
			}
		}
		for (UniprotHomolog hom:toRemove){
			this.list.remove(hom);
			LOGGER.info("Homolog "+hom.getUniId()+" removed because it is redundant.");
		}
		LOGGER.info("Number of homologs after redundancy elimination: "+this.size());

		// finally we update the lookup table
		initialiseMap();
	}
636
637 /**
638 * Reduces the number of homologs by skimming the list for homologs with same identities
639 * until maxDesiredHomologs is reached.
640 * The procedure is as follows: sequences are grouped by identity to query and one of each group
641 * eliminated (if possible the one without valid CDS matching) at each iteration until the
642 * desired number of homologs is reached.
643 * This is needed for example to perform ka/ks calculations with selecton (too many sequences
644 * are far too slow and a risk of ks saturation).
645 * @param maxDesiredHomologs
646 */
647 public void skimList(int maxDesiredHomologs) {
648 if (list.size()<=maxDesiredHomologs) {
649 return;
650 }
651 LOGGER.info("List of homologs too long: "+list.size()+", skimming it.");
652 // 1) grouping by sequence identity
653 Map<Integer,List<UniprotHomolog>> groups = new HashMap<Integer,List<UniprotHomolog>>();
654 for (UniprotHomolog hom:this){
655 double percentId = hom.getPercentIdentity();
656 int key =(int) Math.round(percentId);
657 if (groups.containsKey(key)) {
658 groups.get(key).add(hom);
659 } else {
660 List<UniprotHomolog> list = new ArrayList<UniprotHomolog>();
661 list.add(hom);
662 groups.put(key, list);
663 }
664 }
665 // 2) skimming iteratively
666 int countIterations = 0;
667 outer:
668 while (true) {
669 countIterations++;
670 for (int key:groups.keySet()) {
671 List<UniprotHomolog> group = groups.get(key);
672 if (group.size()>1) {
673 // remove the last element of the group
674 UniprotHomolog toRemove = getHomologNonValidCDS(group);
675 if (toRemove==null) {
676 toRemove = group.get(group.size()-1);
677 }
678 list.remove(toRemove);
679 group.remove(toRemove);
680
681 LOGGER.info("Removed "+toRemove.getUniId());
682 if (list.size()<=maxDesiredHomologs) break outer;
683 }
684 }
685 }
686 LOGGER.info("Size of homolog list after skimming: "+list.size()+" ("+countIterations+" iterations)");
687
688 // and we reinitialise the lookup maps
689 initialiseMap();
690 }
691
692 /**
693 * Given a list of homologs returns the first one that does not have a valid CDS match
694 * or null if all homologs have valid matches.
695 * @param list
696 * @return
697 */
698 private static UniprotHomolog getHomologNonValidCDS(List<UniprotHomolog> list) {
699 for (UniprotHomolog hom:list) {
700 if (hom.getUniprotEntry().getRepresentativeCDS()==null) {
701 return hom;
702 }
703 }
704 return null;
705 }
706
707 /**
708 * Returns the number of homologs in this list
709 * @return
710 */
711 public int size() {
712 return list.size();
713 }
714
	/**
	 * Returns the sequence identity cutoff, see {@link #restrictToMinIdAndCoverage(double, double)}
	 * (the previous javadoc linked a restrictToMinId(double) method that no longer exists).
	 * @return the identity cutoff as a fraction, 0.0 if no filtering was applied
	 */
	public double getIdCutoff() {
		return idCutoff;
	}
722
723 /**
724 * Gets the uniprot version used for blasting
725 * @return
726 */
727 public String getUniprotVer() {
728 return uniprotVer;
729 }
730
731 public int getNumHomologsWithCDS() {
732 int count = 0;
733 for (UniprotHomolog hom:this) {
734 if (hom.getUniprotEntry().hasCDS()) {
735 count++;
736 }
737 }
738 return count;
739 }
740
741 public int getNumHomologsWithValidCDS() {
742 int count = 0;
743 for (UniprotHomolog hom:this) {
744 if (hom.getUniprotEntry().getRepresentativeCDS()!=null) {
745 count++;
746 }
747 }
748 return count;
749 }
750
751 /**
752 * Checks whether this UniprotHomologList contains genes encoded with strictly
753 * one genetic code type.
754 * @see GeneticCodeType
755 * @return
756 */
757 public boolean isConsistentGeneticCodeType() {
758 GeneticCodeType lastGct = null;
759 for (UniprotHomolog hom:this) {
760 GeneticCodeType gct = hom.getUniprotEntry().getGeneticCodeType();
761 if (lastGct!=null && !lastGct.equals(gct)) {
762 return false;
763 }
764 lastGct = gct;
765 }
766 return true;
767 }
768
	/**
	 * Gets the genetic code type of this UniprotHomologList. It does it by simply returning
	 * the genetic code type of the first homolog in the list. It does NOT check for consistency
	 * within the list. For that use {@link #isConsistentGeneticCodeType()}
	 * NOTE(review): throws IndexOutOfBoundsException if the homolog list is empty.
	 * @return the genetic code type of the first homolog
	 */
	public GeneticCodeType getGeneticCodeType() {
		return this.list.get(0).getUniprotEntry().getGeneticCodeType();
	}
778
779 /**
780 * Compute the sequence entropies for all reference sequence (uniprot) positions
781 * @param reducedAlphabet
782 */
783 public void computeEntropies(int reducedAlphabet) {
784 this.reducedAlphabet = reducedAlphabet;
785 this.entropies = new ArrayList<Double>();
786 for (int i=0;i<ref.getUniprotSeq().getLength();i++){
787 entropies.add(this.aln.getColumnEntropy(this.aln.seq2al(ref.getUniprotSeq().getName(),i+1), reducedAlphabet));
788 }
789 }
790
	/**
	 * Compute the sequence ka/ks ratios with selecton for all reference CDS sequence positions.
	 * If resultsFile already exists it is parsed as a cache instead of running selecton,
	 * falling back to running selecton if it is not parseable.
	 * NOTE(review): uses this.nucAln directly — {@link #getNucleotideAlignment()} must have
	 * been called beforehand or nucAln will be null; confirm callers guarantee this.
	 * @param selectonBin the selecton executable
	 * @param resultsFile selecton per-site results file (reused as cache if present)
	 * @param logFile selecton log file
	 * @param treeFile selecton tree file
	 * @param globalResultsFile selecton global results file
	 * @param epsilon the epsilon convergence parameter passed to selecton
	 * @throws IOException if the temp alignment file can't be written or selecton i/o fails
	 */
	public void computeKaKsRatiosSelecton(File selectonBin, File resultsFile, File logFile, File treeFile, File globalResultsFile, double epsilon)
	throws IOException {
		kaksRatios = new ArrayList<Double>();
		SelectonRunner sr = new SelectonRunner(selectonBin);
		if(!resultsFile.exists()) {
			File alnFile = File.createTempFile("selecton.", ".cds.aln");
			alnFile.deleteOnExit();
			this.nucAln.writeFasta(new PrintStream(alnFile), 80, true);
			sr.run(alnFile, resultsFile, logFile, treeFile, null, globalResultsFile, this.ref.getRepresentativeCDS().getCDSName(), this.getGeneticCodeType(),epsilon);
		} else {
			LOGGER.warn("Selecton output file "+resultsFile+" already exists. Using the file instead of running selecton.");
			try {
				sr.parseResultsFile(resultsFile, this.ref.getRepresentativeCDS().getCDSName());
			} catch (FileFormatError e) {
				LOGGER.warn("Cached output selecton file "+resultsFile+" does not seem to be in the righ format");
				LOGGER.warn(e.getMessage());
				LOGGER.warn("Running selecton and overwritting the file.");
				// cache unusable: run selecton from scratch, same as the fresh-run branch above
				File alnFile = File.createTempFile("selecton.", ".cds.aln");
				alnFile.deleteOnExit();
				this.nucAln.writeFasta(new PrintStream(alnFile), 80, true);
				sr.run(alnFile, resultsFile, logFile, treeFile, null, globalResultsFile, this.ref.getRepresentativeCDS().getCDSName(), this.getGeneticCodeType(),epsilon);
			}
		}
		kaksRatios = sr.getKaKsRatios();
		if (kaksRatios.size()!=this.ref.getUniprotSeq().getLength()) {
			LOGGER.info("Size of ka/ks ratio list ("+kaksRatios.size()+") is not the same as length of reference sequence ("+this.ref.getUniprotSeq().getLength()+")");
		}
	}
829
830 public List<Double> getEntropies() {
831 return entropies;
832 }
833
834 public List<Double> getKaksRatios() {
835 return kaksRatios;
836 }
837
838 public int getReducedAlphabet() {
839 return reducedAlphabet;
840 }
841
	/**
	 * Tells whether a given position of the reference sequence (starting at 0)
	 * is reliable with respect to the CDS matching of the reference sequence and the CDS
	 * matchings of the homologs
	 * (not reliable is considered any position where the CDS translation does not match exactly the
	 * protein residue)
	 * @param i 0-based position of the reference (uniprot) sequence
	 * @return true if the position is reliable in the reference and in every homolog that has a CDS, false otherwise
	 */
	public boolean isReferenceSeqPositionReliable(int i) {
		// check if matching CDS of ref sequence is reliable at position i
		if (!this.ref.isReliablePosition(i)) {
			return false;
		}
		// check if each of the matching CDS of the homologs is reliable at position i of the ref sequence
		for (UniprotHomolog hom:this){
			if (hom.getUniprotEntry().getRepresentativeCDS()==null) {
				continue; // the homolog may have no representative CDS, in that case we don't want to check the CDS alignment as it doesn't exist
			}
			// map the 0-based reference position to an alignment column (seq2al is 1-based),
			// then back to the homolog's own (1-based) sequence position
			int alnPos = aln.seq2al(this.ref.getUniprotSeq().getName(), i+1);
			int seqPos = aln.al2seq(hom.getLongSequenceTag(), alnPos);
			if (seqPos==-1) { // the position maps to a gap in the homolog sequence, there's no CDS alignment to check, we can continue to next homolog directly
				continue;
			}
			if (!hom.getUniprotEntry().isReliablePosition(seqPos-1)) {
				return false;
			}
		}
		return true;
	}
872 }

Properties

Name Value
svn:mime-type text/plain