1 |
package owl.core.sequence; |
2 |
|
3 |
import java.io.BufferedReader; |
4 |
import java.io.File; |
5 |
import java.io.FileNotFoundException; |
6 |
import java.io.FileReader; |
7 |
import java.io.IOException; |
8 |
import java.io.PrintStream; |
9 |
import java.io.PrintWriter; |
10 |
import java.util.ArrayList; |
11 |
import java.util.Collection; |
12 |
import java.util.HashMap; |
13 |
import java.util.Iterator; |
14 |
import java.util.List; |
15 |
import java.util.Map; |
16 |
import java.util.Set; |
17 |
import java.util.TreeSet; |
18 |
import java.util.regex.Matcher; |
19 |
import java.util.regex.Pattern; |
20 |
|
21 |
import org.apache.log4j.Logger; |
22 |
import org.xml.sax.SAXException; |
23 |
|
24 |
import owl.core.connections.EmblWSDBfetchConnection; |
25 |
import owl.core.connections.NoMatchFoundException; |
26 |
import owl.core.connections.UniProtConnection; |
27 |
import owl.core.runners.SelectonRunner; |
28 |
import owl.core.runners.TcoffeeError; |
29 |
import owl.core.runners.TcoffeeRunner; |
30 |
import owl.core.runners.blast.BlastError; |
31 |
import owl.core.runners.blast.BlastHit; |
32 |
import owl.core.runners.blast.BlastHitList; |
33 |
import owl.core.runners.blast.BlastRunner; |
34 |
import owl.core.runners.blast.BlastXMLParser; |
35 |
import owl.core.sequence.alignment.AlignmentConstructionError; |
36 |
import owl.core.sequence.alignment.MultipleSequenceAlignment; |
37 |
import owl.core.sequence.alignment.PairwiseSequenceAlignment; |
38 |
import owl.core.sequence.alignment.PairwiseSequenceAlignment.PairwiseSequenceAlignmentException; |
39 |
import owl.core.util.FileFormatError; |
40 |
import owl.core.util.Goodies; |
41 |
import uk.ac.ebi.kraken.interfaces.uniprot.DatabaseType; |
42 |
import uk.ac.ebi.kraken.interfaces.uniprot.NcbiTaxon; |
43 |
import uk.ac.ebi.kraken.interfaces.uniprot.NcbiTaxonomyId; |
44 |
import uk.ac.ebi.kraken.interfaces.uniprot.Organelle; |
45 |
import uk.ac.ebi.kraken.interfaces.uniprot.UniProtEntry; |
46 |
import uk.ac.ebi.kraken.interfaces.uniprot.dbx.embl.Embl; |
47 |
import uk.ac.ebi.kraken.uuw.services.remoting.EntryIterator; |
48 |
|
49 |
/** |
50 |
* Class to store a set of uniprot homologs of a given sequence. |
51 |
* It contains methods to blast against a uniprot database to get the homolog |
52 |
* list and to retrieve data from uniprot (taxonomy, dbrefs) and embl cds sequences. |
53 |
* |
54 |
* @see UniprotHomolog |
55 |
* |
56 |
* @author duarte_j |
57 |
* |
58 |
*/ |
59 |
public class UniprotHomologList implements Iterable<UniprotHomolog>{ |
60 |
|
61 |
/*------------------------ constants --------------------------*/ |
62 |
|
63 |
private static final String BLASTOUT_SUFFIX = "blast.out.xml"; |
64 |
private static final String FASTA_SUFFIX = ".fa"; |
65 |
private static final String BLAST_BASENAME = "homSearch"; |
66 |
|
67 |
private static final int BLAST_OUTPUT_TYPE = 7; // xml output |
68 |
private static final boolean BLAST_NO_FILTERING = true; |
69 |
private static final String UNIPROT_VER_FILE = "reldate.txt"; |
70 |
|
71 |
private static final String TCOFFEE_ALN_OUTFORMAT = "fasta"; |
72 |
|
73 |
private static final boolean DEBUG = false; |
74 |
|
75 |
private static final Logger LOGGER = Logger.getLogger(UniprotHomologList.class); |
76 |
|
77 |
/*-------------------------- members --------------------------*/ |
78 |
|
79 |
private UniprotEntry ref; // the uniprot entry to which the homologs refer |
80 |
private List<UniprotHomolog> list; // the list of homologs |
81 |
private Map<String,List<UniprotHomolog>> lookup; // to speed up searches (uniprot ids to Homologs lists) |
82 |
// it's a list because blast can hit a single uniprot in multiple regions (for us |
83 |
// that's multiple BlastHits) |
84 |
private double idCutoff; // the identity cutoff (see restrictToMinId() ) |
85 |
private String uniprotVer; // the version of uniprot used in blasting, read from the reldate.txt uniprot file |
86 |
|
87 |
private MultipleSequenceAlignment aln; // the protein sequences alignment |
88 |
private MultipleSequenceAlignment nucAln; // the nucleotides alignment |
89 |
|
90 |
private int reducedAlphabet; // the reduced alphabet used to calculate entropies |
91 |
private List<Double> entropies; // entropies for each uniprot reference sequence position |
92 |
private List<Double> kaksRatios; // ka/ks for each uniprot reference sequence position |
93 |
|
94 |
|
95 |
public UniprotHomologList(UniprotEntry ref) { |
96 |
this.ref = ref; |
97 |
this.idCutoff = 0.0; // i.e. no filter |
98 |
} |
99 |
|
100 |
/** |
101 |
* Performs a blast search based on the reference UniprotEntry to populate this list of homologs. |
102 |
* All blast output files will be removed on exit. |
103 |
* @param blastBinDir |
104 |
* @param blastDbDir |
105 |
* @param blastDb |
106 |
* @param blastNumThreads |
107 |
* @param cacheFile a file with the cached xml blast output file, if null blast will be always run |
108 |
* @throws IOException |
109 |
* @throws BlastError |
110 |
*/ |
111 |
public void searchWithBlast(String blastBinDir, String blastDbDir, String blastDb, int blastNumThreads, File cacheFile) throws IOException, BlastError { |
112 |
File outBlast = null; |
113 |
boolean fromCache = false; |
114 |
if (cacheFile!=null && cacheFile.exists()) { |
115 |
outBlast = cacheFile; |
116 |
fromCache = true; |
117 |
LOGGER.warn("Reading blast results from cache file "+cacheFile); |
118 |
} else { |
119 |
outBlast = File.createTempFile(BLAST_BASENAME,BLASTOUT_SUFFIX); |
120 |
File inputSeqFile = File.createTempFile(BLAST_BASENAME,FASTA_SUFFIX); |
121 |
if (!DEBUG) { |
122 |
outBlast.deleteOnExit(); |
123 |
inputSeqFile.deleteOnExit(); |
124 |
} |
125 |
// NOTE: we blast the reference uniprot sequence |
126 |
this.ref.getUniprotSeq().writeToFastaFile(inputSeqFile); |
127 |
BlastRunner blastRunner = new BlastRunner(blastBinDir, blastDbDir); |
128 |
blastRunner.runBlastp(inputSeqFile, blastDb, outBlast, BLAST_OUTPUT_TYPE, BLAST_NO_FILTERING, blastNumThreads); |
129 |
this.uniprotVer = readUniprotVer(blastDbDir); |
130 |
if (cacheFile!=null) { |
131 |
try { |
132 |
Goodies.copyFile(outBlast, cacheFile); |
133 |
} catch (IOException e) { |
134 |
System.err.println("Couldn't write the blast cache file "+cacheFile); |
135 |
System.err.println(e.getMessage()); |
136 |
} |
137 |
} |
138 |
} |
139 |
|
140 |
BlastHitList blastList = null; |
141 |
try { |
142 |
BlastXMLParser blastParser = new BlastXMLParser(outBlast); |
143 |
blastList = blastParser.getHits(); |
144 |
} catch (SAXException e) { |
145 |
if (fromCache) { |
146 |
throw new IOException("Cache file "+cacheFile+" does not comply with blast XML format."); |
147 |
} else { |
148 |
// if this happens it means that blast doesn't format correctly its XML, i.e. has a bug |
149 |
System.err.println("Unexpected error: "+e.getMessage()); |
150 |
System.exit(1); |
151 |
} |
152 |
} |
153 |
|
154 |
if (fromCache) { |
155 |
if (!blastList.getQueryId().equals(this.ref.getUniprotSeq().getName())) { |
156 |
throw new IOException("Query id "+blastList.getQueryId()+" from cache file "+cacheFile+" does not match the id from the sequence: "+this.ref.getUniprotSeq().getName()); |
157 |
} |
158 |
this.uniprotVer = readUniprotVer(cacheFile.getParent()); |
159 |
String uniprotVerFromBlastDbDir = readUniprotVer(blastDbDir); |
160 |
if (!uniprotVerFromBlastDbDir.equals(uniprotVer)) { |
161 |
throw new IOException("Uniprot version from blast db dir "+blastDbDir+" does not match version in cache dir "+cacheFile.getParent()); |
162 |
} |
163 |
} |
164 |
|
165 |
|
166 |
this.list = new ArrayList<UniprotHomolog>(); |
167 |
for (BlastHit hit:blastList) { |
168 |
String sid = hit.getSubjectId(); |
169 |
Matcher m = Sequence.DEFLINE_PRIM_ACCESSION_REGEX.matcher(sid); |
170 |
if (m.matches()) { |
171 |
String uniId = m.group(1); |
172 |
list.add(new UniprotHomolog(hit,new UniprotEntry(uniId))); |
173 |
} else { |
174 |
System.err.println("Could not find uniprot id in subject id "+sid); |
175 |
} |
176 |
} |
177 |
initialiseMap(); |
178 |
} |
179 |
|
180 |
//TODO write a searchithPSIBlast method |
181 |
|
182 |
/** |
183 |
* Initialises the lookup map (for speeding up lookups of homologs by uniprot ids) |
184 |
*/ |
185 |
private void initialiseMap() { |
186 |
this.lookup = new HashMap<String, List<UniprotHomolog>>(); |
187 |
for (UniprotHomolog hom:this) { |
188 |
if (lookup.containsKey(hom.getUniId())) { |
189 |
lookup.get(hom.getUniId()).add(hom); |
190 |
} else { |
191 |
List<UniprotHomolog> list = new ArrayList<UniprotHomolog>(); |
192 |
list.add(hom); |
193 |
lookup.put(hom.getUniId(), list); |
194 |
} |
195 |
|
196 |
} |
197 |
} |
198 |
|
199 |
private String readUniprotVer(String blastDbDir) { |
200 |
String ver = "unknown"; |
201 |
File uniprotVerFile = new File(blastDbDir,UNIPROT_VER_FILE); |
202 |
try { |
203 |
|
204 |
BufferedReader br = new BufferedReader(new FileReader(uniprotVerFile)); |
205 |
String line; |
206 |
Pattern p = Pattern.compile("^UniProt\\sKnowledgebase\\sRelease\\s([\\d._]+)\\s.*"); |
207 |
while ((line=br.readLine())!=null){ |
208 |
Matcher m = p.matcher(line); |
209 |
if (m.matches()) { |
210 |
ver = m.group(1); |
211 |
break; |
212 |
} |
213 |
} |
214 |
br.close(); |
215 |
} catch(IOException e) { |
216 |
System.err.println("Warning: couldn't read uniprot version from file "+uniprotVerFile); |
217 |
} |
218 |
return ver; |
219 |
} |
220 |
|
221 |
/** |
222 |
* Retrieves from UniprotKB the sequence, taxonomy and EMBL CDS ids data, |
223 |
* by using the remote Uniprot API |
224 |
*/ |
225 |
public void retrieveUniprotKBData() { |
226 |
UniProtConnection uniprotConn = new UniProtConnection(); |
227 |
if (!uniprotConn.getVersion().equals(this.uniprotVer)){ |
228 |
System.err.println("Warning! Uniprot version used for blast ("+uniprotVer+") and uniprot version being queried with api ("+uniprotConn.getVersion()+")don't match!"); |
229 |
} |
230 |
List<String> uniprotIds = new ArrayList<String>(); |
231 |
for (UniprotHomolog hom:this) { |
232 |
uniprotIds.add(hom.getUniId()); |
233 |
} |
234 |
EntryIterator<UniProtEntry> entries = uniprotConn.getMultipleEntries(uniprotIds); |
235 |
|
236 |
for (UniProtEntry entry:entries) { |
237 |
List<UniprotHomolog> homs = this.getHomolog(entry.getPrimaryUniProtAccession().getValue()); |
238 |
for (UniprotHomolog hom:homs) { |
239 |
hom.getUniprotEntry().setUniprotSeq(new Sequence(hom.getUniId(),entry.getSequence().getValue())); |
240 |
|
241 |
List<NcbiTaxonomyId> ncbiTaxIds = entry.getNcbiTaxonomyIds(); |
242 |
if (ncbiTaxIds.size()>1) { |
243 |
System.err.println("Warning! more than one taxonomy id for uniprot entry "+hom.getUniId()); |
244 |
} |
245 |
hom.getUniprotEntry().setTaxId(ncbiTaxIds.get(0).getValue()); |
246 |
List<String> taxons = new ArrayList<String>(); |
247 |
for(NcbiTaxon ncbiTaxon:entry.getTaxonomy()) { |
248 |
taxons.add(ncbiTaxon.getValue()); |
249 |
} |
250 |
hom.getUniprotEntry().setTaxons(taxons); |
251 |
|
252 |
Collection<Embl> emblrefs = entry.getDatabaseCrossReferences(DatabaseType.EMBL); |
253 |
List<String> emblCdsIds = new ArrayList<String>(); |
254 |
Set<String> tmpEmblCdsIdsSet = new TreeSet<String>(); |
255 |
for(Embl ref:emblrefs) { |
256 |
String emblCdsIdWithVer = ref.getEmblProteinId().getValue(); |
257 |
if (!emblCdsIdWithVer.equals("-")) { // for non annotated genomic dna cds sequences the identifier is '-', we ignore them |
258 |
String emblCdsId = emblCdsIdWithVer.substring(0, emblCdsIdWithVer.lastIndexOf(".")); |
259 |
//emblCdsIds.add(emblCdsId); |
260 |
tmpEmblCdsIdsSet.add(emblCdsId); |
261 |
} |
262 |
} |
263 |
emblCdsIds.addAll(tmpEmblCdsIdsSet); // we use the set to be sure there are no duplicates (it does happen sometimes) |
264 |
hom.getUniprotEntry().setEmblCdsIds(emblCdsIds); |
265 |
|
266 |
List<Organelle> orglls = entry.getOrganelles(); |
267 |
if (orglls.size()>0) { |
268 |
hom.getUniprotEntry().setGeneEncodingOrganelle(orglls.get(0).getType().getValue()); |
269 |
if (orglls.size()>1) { |
270 |
for (Organelle orgll:orglls){ |
271 |
if (!orgll.getType().equals(hom.getUniprotEntry().getGeneEncodingOrganelle())) { |
272 |
System.err.println("Warning! Different gene encoding organelles for Uniprot "+hom.getUniId()); |
273 |
} |
274 |
} |
275 |
} |
276 |
} |
277 |
} |
278 |
} |
279 |
} |
280 |
|
281 |
/** |
282 |
* Retrieves from EMBL DB fetch web service the EMBL CDS sequences |
283 |
* @param cacheFile a FASTA file containing the sequences to retrieve. If present and if |
284 |
* it contains ALL required sequences then they are read from cacheFile. If null or file |
285 |
* does not exist or file older than {@value #EmblWSDBfetchConnect.MAX_CACHE_AGE} then the sequences are |
286 |
* retrieved from EMBL DB fetch |
287 |
* @throws IOException |
288 |
*/ |
289 |
public void retrieveEmblCdsSeqs(File cacheFile) throws IOException { |
290 |
List<String> allIds = new ArrayList<String>(); |
291 |
|
292 |
for (UniprotHomolog hom:this) { |
293 |
allIds.addAll(hom.getUniprotEntry().getEmblCdsIds()); |
294 |
} |
295 |
List<Sequence> allSeqs = null; |
296 |
try { |
297 |
allSeqs = EmblWSDBfetchConnection.fetchEMBLCDS(allIds, cacheFile); |
298 |
} catch (NoMatchFoundException e) { |
299 |
// this is unlikely to happen here, that's why we don't write a better error message |
300 |
LOGGER.warn("Couldn't retrieve EMBL CDS sequences for some EMBL cds ids"); |
301 |
LOGGER.warn(e.getMessage()); |
302 |
} |
303 |
|
304 |
// we put the list (containing all the sequences from all the homologs) in a lookup table |
305 |
// so that we can then retrieve the ones corresponding to each homolog below |
306 |
Map<String,Sequence> lookup = new HashMap<String,Sequence>(); |
307 |
for (Sequence seq:allSeqs) { |
308 |
lookup.put(seq.getSecondaryAccession(), seq); |
309 |
} |
310 |
|
311 |
for (UniprotHomolog hom:this) { |
312 |
List<Sequence> seqs = new ArrayList<Sequence>(); |
313 |
for (String emblCdsId:hom.getUniprotEntry().getEmblCdsIds()) { |
314 |
if (lookup.containsKey(emblCdsId)) { |
315 |
seqs.add(lookup.get(emblCdsId)); |
316 |
} else { |
317 |
// this will happen when the CDS sequence was not returned by embl dbfetch or the cache file does not have it (it's in the list of missing entries) |
318 |
// in either case we don't want the list of emblcs sequences to contain a null |
319 |
LOGGER.warn("Sequence for EMBL CDS "+emblCdsId+" of uniprot entry "+hom.getUniId()+" could not be found. Not using it."); |
320 |
//TODO should we also remove from the list of embl cds ids this emblCdsId? Not sure if it can cause problems |
321 |
// to have an embl cds id without its corresponding sequence |
322 |
} |
323 |
} |
324 |
|
325 |
hom.getUniprotEntry().setEmblCdsSeqs(seqs); |
326 |
} |
327 |
} |
328 |
|
329 |
/** |
330 |
* Gets the Homolog given a uniprot ID |
331 |
* @param uniprotId |
332 |
* @return |
333 |
*/ |
334 |
public List<UniprotHomolog> getHomolog(String uniprotId) { |
335 |
return this.lookup.get(uniprotId); |
336 |
} |
337 |
|
338 |
public Iterator<UniprotHomolog> iterator() { |
339 |
return this.list.iterator(); |
340 |
} |
341 |
|
342 |
/** |
343 |
* Write to the given file the query protein sequence and all the homolog protein |
344 |
* sequences (full uniprot sequences) in fasta format. |
345 |
* @param outFile |
346 |
* @param writeQuery if true the query sequence is written as well, if false only homologs |
347 |
* @throws FileNotFoundException |
348 |
*/ |
349 |
public void writeToFasta(File outFile, boolean writeQuery) throws FileNotFoundException { |
350 |
PrintWriter pw = new PrintWriter(outFile); |
351 |
|
352 |
int len = 80; |
353 |
|
354 |
if (writeQuery) { |
355 |
pw.println(MultipleSequenceAlignment.FASTAHEADER_CHAR + this.ref.getUniprotSeq().getName()); |
356 |
for(int i=0; i<this.ref.getLength(); i+=len) { |
357 |
pw.println(ref.getUniprotSeq().getSeq().substring(i, Math.min(i+len,ref.getLength()))); |
358 |
} |
359 |
} |
360 |
|
361 |
for(UniprotHomolog hom:this) { |
362 |
|
363 |
String sequence = hom.getUniprotSeq().getSeq(); |
364 |
pw.println(MultipleSequenceAlignment.FASTAHEADER_CHAR + hom.getLongSequenceTag()); |
365 |
for(int i=0; i<sequence.length(); i+=len) { |
366 |
pw.println(sequence.substring(i, Math.min(i+len,sequence.length()))); |
367 |
} |
368 |
} |
369 |
pw.println(); |
370 |
pw.close(); |
371 |
} |
372 |
|
373 |
/** |
374 |
* Runs t_coffee to align all protein sequences of homologs and the query sequence |
375 |
* returning a MultipleSequenceAlignment object |
376 |
* @param tcoffeeBin |
377 |
* @param veryFast whether to use t_coffee's very fast alignment (and less accurate) mode |
378 |
* @throws IOException |
379 |
* @throws TcoffeeError |
380 |
*/ |
381 |
public void computeTcoffeeAlignment(File tcoffeeBin, boolean veryFast) throws IOException, TcoffeeError { |
382 |
File homologSeqsFile = File.createTempFile("homologs.", ".fa"); |
383 |
File outTreeFile = File.createTempFile("homologs.", ".dnd"); |
384 |
File alnFile = File.createTempFile("homologs.",".aln"); |
385 |
File tcoffeeLogFile = File.createTempFile("homologs.",".tcoffee.log"); |
386 |
if (!DEBUG) { |
387 |
homologSeqsFile.deleteOnExit(); |
388 |
alnFile.deleteOnExit(); |
389 |
tcoffeeLogFile.deleteOnExit(); |
390 |
outTreeFile.deleteOnExit(); |
391 |
} |
392 |
this.writeToFasta(homologSeqsFile, true); |
393 |
TcoffeeRunner tcr = new TcoffeeRunner(tcoffeeBin); |
394 |
tcr.runTcoffee(homologSeqsFile, alnFile, TCOFFEE_ALN_OUTFORMAT, outTreeFile, null, tcoffeeLogFile, veryFast); |
395 |
|
396 |
try { |
397 |
aln = new MultipleSequenceAlignment(alnFile.getAbsolutePath(), MultipleSequenceAlignment.FASTAFORMAT); |
398 |
} catch (FileFormatError e) { |
399 |
System.err.println("Unexpected error, output file of tcoffee "+alnFile+" does not seem to be in the right format."); |
400 |
System.err.println("Error: "+e.getMessage()); |
401 |
System.exit(1); |
402 |
} catch (AlignmentConstructionError e) { |
403 |
System.err.println("Unexpected error, output file of tcoffee "+alnFile+" seems to contain ."); |
404 |
System.err.println("Error: "+e.getMessage()); |
405 |
System.exit(1); |
406 |
} |
407 |
|
408 |
} |
409 |
|
410 |
/** |
411 |
* Returns the protein sequence alignment of query sequence and all homologs |
412 |
* @return |
413 |
*/ |
414 |
public MultipleSequenceAlignment getAlignment() { |
415 |
return aln; |
416 |
} |
417 |
|
418 |
/** |
419 |
* Returns a multiple sequence alignment of all valid nucleotide CDS sequences from the |
420 |
* UniprotHomologList plus the query's CDS by mapping the nucleotide sequences to the protein |
421 |
* sequences alignment. |
422 |
* @return |
423 |
*/ |
424 |
public MultipleSequenceAlignment getNucleotideAlignment() { |
425 |
if (nucAln!=null) { |
426 |
return nucAln; |
427 |
} |
428 |
List<Sequence> allSeqs = new ArrayList<Sequence>(); |
429 |
|
430 |
// CDS of the query sequence |
431 |
StringBuffer nucSeqSB = new StringBuffer(); |
432 |
String queryProtSeq = aln.getAlignedSequence(ref.getUniprotSeq().getName()); |
433 |
ProteinToCDSMatch queryMatching = ref.getRepresentativeCDS(); |
434 |
if (queryMatching!=null) { |
435 |
String bestNucSeq = queryMatching.getNucleotideSeqForBestTranslation(); |
436 |
int j = 0; |
437 |
for (int i=0;i<queryProtSeq.length();i++) { |
438 |
char aa = queryProtSeq.charAt(i); |
439 |
if (aa==MultipleSequenceAlignment.GAPCHARACTER) { |
440 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
441 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
442 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
443 |
} else { |
444 |
if (j+3<=bestNucSeq.length()) { |
445 |
nucSeqSB.append(bestNucSeq.substring(j, j+3)); |
446 |
} else { |
447 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
448 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
449 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
450 |
} |
451 |
j+=3; |
452 |
} |
453 |
} |
454 |
allSeqs.add(new Sequence(queryMatching.getCDSName(),nucSeqSB.toString())); |
455 |
} |
456 |
|
457 |
for (UniprotHomolog hom:this) { |
458 |
nucSeqSB = new StringBuffer(); |
459 |
String protSeq = aln.getAlignedSequence(hom.getBlastHit().getSubjectId()); |
460 |
ProteinToCDSMatch matching = hom.getUniprotEntry().getRepresentativeCDS(); |
461 |
if (matching!=null) { |
462 |
String bestNucSeq = matching.getNucleotideSeqForBestTranslation(); |
463 |
int j = 0; |
464 |
for (int i=0;i<protSeq.length();i++) { |
465 |
char aa = protSeq.charAt(i); |
466 |
if (aa==MultipleSequenceAlignment.GAPCHARACTER) { |
467 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
468 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
469 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
470 |
} else { |
471 |
if (j+3<bestNucSeq.length()) { |
472 |
nucSeqSB.append(bestNucSeq.substring(j, j+3)); |
473 |
} else { |
474 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
475 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
476 |
nucSeqSB.append(MultipleSequenceAlignment.GAPCHARACTER); |
477 |
} |
478 |
j+=3; |
479 |
|
480 |
} |
481 |
} |
482 |
allSeqs.add(new Sequence(matching.getCDSName(),nucSeqSB.toString())); |
483 |
} |
484 |
} |
485 |
try { |
486 |
nucAln = new MultipleSequenceAlignment(allSeqs); |
487 |
} catch(AlignmentConstructionError e) { |
488 |
System.err.println("Unexpected error while creating the nucleotides alignment"); |
489 |
System.err.println(e.getMessage()); |
490 |
System.exit(1); |
491 |
} |
492 |
|
493 |
return nucAln; |
494 |
} |
495 |
|
496 |
public void writeAlignmentToFile(File alnFile) throws FileNotFoundException { |
497 |
aln.writeFasta(new PrintStream(alnFile), 80, true); |
498 |
} |
499 |
|
500 |
public void writeNucleotideAlignmentToFile(File alnFile) throws FileNotFoundException { |
501 |
this.getNucleotideAlignment().writeFasta(new PrintStream(alnFile), 80, true); |
502 |
} |
503 |
|
504 |
/** |
505 |
* Removes homologs below the given sequence identity value. |
506 |
* @param idCutoff |
507 |
*/ |
508 |
public void restrictToMinIdAndCoverage(double idCutoff, double queryCovCutoff) { |
509 |
this.idCutoff = idCutoff; |
510 |
|
511 |
Iterator<UniprotHomolog> it = list.iterator(); |
512 |
while (it.hasNext()) { |
513 |
UniprotHomolog hom = it.next(); |
514 |
if ((hom.getBlastHit().getPercentIdentity()/100.0)<=idCutoff || hom.getBlastHit().getQueryCoverage()<=queryCovCutoff) { |
515 |
it.remove(); |
516 |
|
517 |
} |
518 |
} |
519 |
// finally we update the lookup table |
520 |
initialiseMap(); |
521 |
} |
522 |
|
523 |
/** |
524 |
* Removes the redundant sequences in this list of Homologs. The redundancy reduction procedes as follows: |
525 |
* 1) It groups the sequences by taxonomy id and sequence identity |
526 |
* 2) If any of the groups have more than one member then the pairwise identities within the group are calculated |
527 |
* (all vs all Needleman-Wunsch). From the pairwise identity matrix sequences that are not 100% identity to all the others |
528 |
* are removed from the group, leaving groups that contain only identical sequences from same species. |
529 |
* 3) If after this second pruning any group has more than 1 member then a single member is chosen (first one with a |
530 |
* good matching corresponding CDS sequence or simply first one if no good CDS matchings exist) |
531 |
*/ |
532 |
public void removeRedundancy() { |
533 |
|
534 |
// 1) grouping by tax id and sequence identity |
535 |
Map<String,List<UniprotHomolog>> groups = new HashMap<String,List<UniprotHomolog>>(); |
536 |
for (UniprotHomolog hom:this){ |
537 |
double percentId = hom.getPercentIdentity(); |
538 |
String taxId = hom.getUniprotEntry().getTaxId(); |
539 |
String key = taxId+"_"+String.format("%5.1f",percentId); |
540 |
if (groups.containsKey(key)) { |
541 |
groups.get(key).add(hom); |
542 |
} else { |
543 |
List<UniprotHomolog> list = new ArrayList<UniprotHomolog>(); |
544 |
list.add(hom); |
545 |
groups.put(key, list); |
546 |
} |
547 |
} |
548 |
LOGGER.debug("Number of protein sequence groups for redundancy elimination (based on same identity value and same tax id): "+groups.size()); |
549 |
// 2) finding if group members are really identical (all vs all pairwise alignments) |
550 |
for (String key:groups.keySet()) { |
551 |
// all vs all pairwise alignments |
552 |
List<UniprotHomolog> list = groups.get(key); |
553 |
LOGGER.debug("Size of group "+key+": "+list.size()); |
554 |
if (list.size()>1) { |
555 |
double[][] pairwiseIdMatrix = new double[list.size()][list.size()]; |
556 |
for (int i=0;i<list.size();i++){ |
557 |
for (int j=i+1;j<list.size();j++){ |
558 |
try { |
559 |
PairwiseSequenceAlignment aln = new PairwiseSequenceAlignment(list.get(i).getUniprotSeq(), list.get(j).getUniprotSeq()); |
560 |
pairwiseIdMatrix[i][j]=aln.getPercentIdentity(); |
561 |
} catch (PairwiseSequenceAlignmentException e) { |
562 |
LOGGER.fatal("Unexpected error. Couldn't align sequences for redundancy removal procedure"); |
563 |
LOGGER.fatal(e.getMessage()); |
564 |
System.exit(1); |
565 |
} |
566 |
} |
567 |
} |
568 |
// mirroring the other side of the matrix |
569 |
for (int i=0;i<list.size();i++){ |
570 |
for (int j=i+1;j<list.size();j++){ |
571 |
pairwiseIdMatrix[j][i] = pairwiseIdMatrix[i][j]; |
572 |
} |
573 |
} |
574 |
|
575 |
String matStr = ""; |
576 |
// if a member is not identical to all the others then we throw it away |
577 |
boolean[] hasNoIdenticalPair = new boolean[list.size()]; |
578 |
for (int i=0;i<list.size()-1;i++){ |
579 |
boolean hasIdenticalPair = false; |
580 |
for (int j=0;j<list.size();j++){ |
581 |
matStr+=String.format("%5.1f ", pairwiseIdMatrix[i][j]); |
582 |
if (pairwiseIdMatrix[i][j]>99.99f) { |
583 |
hasIdenticalPair = true; |
584 |
} |
585 |
} |
586 |
if (!hasIdenticalPair) hasNoIdenticalPair[i] = true; |
587 |
matStr+="\n"; |
588 |
} |
589 |
|
590 |
LOGGER.debug("Pairwise similarities: \n"+matStr); |
591 |
Iterator<UniprotHomolog> it = list.iterator(); |
592 |
int i = 0; |
593 |
while (it.hasNext()){ |
594 |
it.next(); |
595 |
if (hasNoIdenticalPair[i]) { |
596 |
it.remove(); |
597 |
} |
598 |
i++; |
599 |
} |
600 |
LOGGER.debug("Size of group after elimination of sequences not identical to any of the others: "+list.size()); |
601 |
} |
602 |
} |
603 |
// 3) if there are still groups with size>1 then they are really redundant, all sequences except one have to be eliminated |
604 |
// in any case we first check that the one we want to eliminate doesn't have a better CDS representative |
605 |
List<UniprotHomolog> toRemove = new ArrayList<UniprotHomolog>(); |
606 |
for (String key:groups.keySet()) { |
607 |
List<UniprotHomolog> list = groups.get(key); |
608 |
if (list.size()>1) { |
609 |
UniprotHomolog homToKeep = null; |
610 |
for (UniprotHomolog hom:list){ |
611 |
if (hom.getUniprotEntry().getRepresentativeCDS()!=null) { |
612 |
homToKeep = hom; |
613 |
break; |
614 |
} |
615 |
} |
616 |
if (homToKeep==null) { // if there wasn't any good CDS homolog we remove all but first |
617 |
for (int i=1;i<list.size();i++) { |
618 |
toRemove.add(list.get(i)); |
619 |
} |
620 |
} else { // if we found one good CDS homolog we remove all the others |
621 |
for (UniprotHomolog hom:list) { |
622 |
if (hom!=homToKeep) toRemove.add(hom); |
623 |
} |
624 |
} |
625 |
} |
626 |
} |
627 |
for (UniprotHomolog hom:toRemove){ |
628 |
this.list.remove(hom); |
629 |
LOGGER.info("Homolog "+hom.getUniId()+" removed because it is redundant."); |
630 |
} |
631 |
LOGGER.info("Number of homologs after redundancy elimination: "+this.size()); |
632 |
|
633 |
// finally we update the lookup table |
634 |
initialiseMap(); |
635 |
} |
636 |
|
637 |
/** |
638 |
* Reduces the number of homologs by skimming the list for homologs with same identities |
639 |
* until maxDesiredHomologs is reached. |
640 |
* The procedure is as follows: sequences are grouped by identity to query and one of each group |
641 |
* eliminated (if possible the one without valid CDS matching) at each iteration until the |
642 |
* desired number of homologs is reached. |
643 |
* This is needed for example to perform ka/ks calculations with selecton (too many sequences |
644 |
* are far too slow and a risk of ks saturation). |
645 |
* @param maxDesiredHomologs |
646 |
*/ |
647 |
public void skimList(int maxDesiredHomologs) { |
648 |
if (list.size()<=maxDesiredHomologs) { |
649 |
return; |
650 |
} |
651 |
LOGGER.info("List of homologs too long: "+list.size()+", skimming it."); |
652 |
// 1) grouping by sequence identity |
653 |
Map<Integer,List<UniprotHomolog>> groups = new HashMap<Integer,List<UniprotHomolog>>(); |
654 |
for (UniprotHomolog hom:this){ |
655 |
double percentId = hom.getPercentIdentity(); |
656 |
int key =(int) Math.round(percentId); |
657 |
if (groups.containsKey(key)) { |
658 |
groups.get(key).add(hom); |
659 |
} else { |
660 |
List<UniprotHomolog> list = new ArrayList<UniprotHomolog>(); |
661 |
list.add(hom); |
662 |
groups.put(key, list); |
663 |
} |
664 |
} |
665 |
// 2) skimming iteratively |
666 |
int countIterations = 0; |
667 |
outer: |
668 |
while (true) { |
669 |
countIterations++; |
670 |
for (int key:groups.keySet()) { |
671 |
List<UniprotHomolog> group = groups.get(key); |
672 |
if (group.size()>1) { |
673 |
// remove the last element of the group |
674 |
UniprotHomolog toRemove = getHomologNonValidCDS(group); |
675 |
if (toRemove==null) { |
676 |
toRemove = group.get(group.size()-1); |
677 |
} |
678 |
list.remove(toRemove); |
679 |
group.remove(toRemove); |
680 |
|
681 |
LOGGER.info("Removed "+toRemove.getUniId()); |
682 |
if (list.size()<=maxDesiredHomologs) break outer; |
683 |
} |
684 |
} |
685 |
} |
686 |
LOGGER.info("Size of homolog list after skimming: "+list.size()+" ("+countIterations+" iterations)"); |
687 |
|
688 |
// and we reinitialise the lookup maps |
689 |
initialiseMap(); |
690 |
} |
691 |
|
692 |
/** |
693 |
* Given a list of homologs returns the first one that does not have a valid CDS match |
694 |
* or null if all homologs have valid matches. |
695 |
* @param list |
696 |
* @return |
697 |
*/ |
698 |
private static UniprotHomolog getHomologNonValidCDS(List<UniprotHomolog> list) { |
699 |
for (UniprotHomolog hom:list) { |
700 |
if (hom.getUniprotEntry().getRepresentativeCDS()==null) { |
701 |
return hom; |
702 |
} |
703 |
} |
704 |
return null; |
705 |
} |
706 |
|
707 |
/** |
708 |
* Returns the number of homologs in this list |
709 |
* @return |
710 |
*/ |
711 |
public int size() { |
712 |
return list.size(); |
713 |
} |
714 |
|
715 |
/** |
716 |
* Returns the sequence identity cutoff, see {@link #restrictToMinId(double)} |
717 |
* @return |
718 |
*/ |
719 |
public double getIdCutoff() { |
720 |
return idCutoff; |
721 |
} |
722 |
|
723 |
/** |
724 |
* Gets the uniprot version used for blasting |
725 |
* @return |
726 |
*/ |
727 |
public String getUniprotVer() { |
728 |
return uniprotVer; |
729 |
} |
730 |
|
731 |
public int getNumHomologsWithCDS() { |
732 |
int count = 0; |
733 |
for (UniprotHomolog hom:this) { |
734 |
if (hom.getUniprotEntry().hasCDS()) { |
735 |
count++; |
736 |
} |
737 |
} |
738 |
return count; |
739 |
} |
740 |
|
741 |
public int getNumHomologsWithValidCDS() { |
742 |
int count = 0; |
743 |
for (UniprotHomolog hom:this) { |
744 |
if (hom.getUniprotEntry().getRepresentativeCDS()!=null) { |
745 |
count++; |
746 |
} |
747 |
} |
748 |
return count; |
749 |
} |
750 |
|
751 |
/** |
752 |
* Checks whether this UniprotHomologList contains genes encoded with strictly |
753 |
* one genetic code type. |
754 |
* @see GeneticCodeType |
755 |
* @return |
756 |
*/ |
757 |
public boolean isConsistentGeneticCodeType() { |
758 |
GeneticCodeType lastGct = null; |
759 |
for (UniprotHomolog hom:this) { |
760 |
GeneticCodeType gct = hom.getUniprotEntry().getGeneticCodeType(); |
761 |
if (lastGct!=null && !lastGct.equals(gct)) { |
762 |
return false; |
763 |
} |
764 |
lastGct = gct; |
765 |
} |
766 |
return true; |
767 |
} |
768 |
|
769 |
/** |
770 |
* Gets the genetic code type of this UniprotHomologList. It does it by simply returning |
771 |
* the genetic code type of the first homolog in the list. It does NOT check for consistency |
772 |
* within the list. For that use {@link #isConsistentGeneticCodeType()} |
773 |
* @return |
774 |
*/ |
775 |
public GeneticCodeType getGeneticCodeType() { |
776 |
return this.list.get(0).getUniprotEntry().getGeneticCodeType(); |
777 |
} |
778 |
|
779 |
/** |
780 |
* Compute the sequence entropies for all reference sequence (uniprot) positions |
781 |
* @param reducedAlphabet |
782 |
*/ |
783 |
public void computeEntropies(int reducedAlphabet) { |
784 |
this.reducedAlphabet = reducedAlphabet; |
785 |
this.entropies = new ArrayList<Double>(); |
786 |
for (int i=0;i<ref.getUniprotSeq().getLength();i++){ |
787 |
entropies.add(this.aln.getColumnEntropy(this.aln.seq2al(ref.getUniprotSeq().getName(),i+1), reducedAlphabet)); |
788 |
} |
789 |
} |
790 |
|
791 |
/** |
792 |
* Compute the sequence ka/ks ratios with selecton for all reference CDS sequence positions |
793 |
* @param selectonBin |
794 |
* @param resultsFile |
795 |
* @param logFile |
796 |
* @param treeFile |
797 |
* @param globalResultsFile |
798 |
* @param epsilon |
799 |
* @throws IOException |
800 |
*/ |
801 |
public void computeKaKsRatiosSelecton(File selectonBin, File resultsFile, File logFile, File treeFile, File globalResultsFile, double epsilon) |
802 |
throws IOException { |
803 |
kaksRatios = new ArrayList<Double>(); |
804 |
SelectonRunner sr = new SelectonRunner(selectonBin); |
805 |
if(!resultsFile.exists()) { |
806 |
File alnFile = File.createTempFile("selecton.", ".cds.aln"); |
807 |
alnFile.deleteOnExit(); |
808 |
this.nucAln.writeFasta(new PrintStream(alnFile), 80, true); |
809 |
sr.run(alnFile, resultsFile, logFile, treeFile, null, globalResultsFile, this.ref.getRepresentativeCDS().getCDSName(), this.getGeneticCodeType(),epsilon); |
810 |
} else { |
811 |
LOGGER.warn("Selecton output file "+resultsFile+" already exists. Using the file instead of running selecton."); |
812 |
try { |
813 |
sr.parseResultsFile(resultsFile, this.ref.getRepresentativeCDS().getCDSName()); |
814 |
} catch (FileFormatError e) { |
815 |
LOGGER.warn("Cached output selecton file "+resultsFile+" does not seem to be in the righ format"); |
816 |
LOGGER.warn(e.getMessage()); |
817 |
LOGGER.warn("Running selecton and overwritting the file."); |
818 |
File alnFile = File.createTempFile("selecton.", ".cds.aln"); |
819 |
alnFile.deleteOnExit(); |
820 |
this.nucAln.writeFasta(new PrintStream(alnFile), 80, true); |
821 |
sr.run(alnFile, resultsFile, logFile, treeFile, null, globalResultsFile, this.ref.getRepresentativeCDS().getCDSName(), this.getGeneticCodeType(),epsilon); |
822 |
} |
823 |
} |
824 |
kaksRatios = sr.getKaKsRatios(); |
825 |
if (kaksRatios.size()!=this.ref.getUniprotSeq().getLength()) { |
826 |
LOGGER.info("Size of ka/ks ratio list ("+kaksRatios.size()+") is not the same as length of reference sequence ("+this.ref.getUniprotSeq().getLength()+")"); |
827 |
} |
828 |
} |
829 |
|
830 |
public List<Double> getEntropies() { |
831 |
return entropies; |
832 |
} |
833 |
|
834 |
public List<Double> getKaksRatios() { |
835 |
return kaksRatios; |
836 |
} |
837 |
|
838 |
public int getReducedAlphabet() { |
839 |
return reducedAlphabet; |
840 |
} |
841 |
|
842 |
/** |
843 |
* Tells whether a given position of the reference sequence (starting at 0) |
844 |
* is reliable with respect to the CDS matching of the reference sequence and the CDS |
845 |
* matchings of the homologs |
846 |
* (not reliable is considered any position where the CDS translation does not match exactly the |
847 |
* protein residue) |
848 |
* @param i |
849 |
* @return |
850 |
*/ |
851 |
public boolean isReferenceSeqPositionReliable(int i) { |
852 |
// check if matching CDS of ref sequence is reliable at position i |
853 |
if (!this.ref.isReliablePosition(i)) { |
854 |
return false; |
855 |
} |
856 |
// check if each of the matching CDS of the homologs is reliable at position i of the ref sequence |
857 |
for (UniprotHomolog hom:this){ |
858 |
if (hom.getUniprotEntry().getRepresentativeCDS()==null) { |
859 |
continue; // the homolog may have no representative CDS, in that case we don't want to check the CDS alignment as it doesn't exist |
860 |
} |
861 |
int alnPos = aln.seq2al(this.ref.getUniprotSeq().getName(), i+1); |
862 |
int seqPos = aln.al2seq(hom.getLongSequenceTag(), alnPos); |
863 |
if (seqPos==-1) { // the position maps to a gap in the homolog sequence, there's no CDS alignment to check, we can continue to next homolog directly |
864 |
continue; |
865 |
} |
866 |
if (!hom.getUniprotEntry().isReliablePosition(seqPos-1)) { |
867 |
return false; |
868 |
} |
869 |
} |
870 |
return true; |
871 |
} |
872 |
} |