Submit | Open tickets | Closed tickets

[ Ticket #1107 ] long sequences and 64 bit
Date:
01/18/11 01:27
Submitted by:
alpapan
Assigned to:
unset
Category:
Clustering
Priority:
5
Ticket group:
Critical
Resolution:
Unset
Summary:
long sequences and 64 bit
Original submission:
Hello

Unfortunately, a variable-type bug on 64-bit machines causes the program to estimate that a negative amount of memory is required to start a run (tested with cd-hit-est and one 50,000 bp sequence). The following patch ought to fix it.

Thanks,
Michael Ott & Alexie Papanicolaou
CSIRO Ecosystem Sciences

diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.c++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++
--- cd-hit-v4.3-2010-10-25/cdhit-common.c++ 2010-10-26 11:04:34.000000000 +1100
+++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++ 2011-01-18 15:40:46.285906232 +1100
@@ -2081,9 +2081,9 @@ void SequenceDB::ClusterOne( Sequence *s
}
}
#include<assert.h>
-size_t SequenceDB::MinimalMemory( int frag_no, int bsize, int T, const Options & options )
+size_t SequenceDB::MinimalMemory( int frag_no, size_t bsize, int T, const Options & options )
{
- int N = sequences.size();
+ size_t N = sequences.size();
int F = frag_no < MAXNUM*CHUNK2 ? frag_no : MAXNUM*CHUNK2;
size_t mem_need = 0;
size_t mem, mega = 1000000;
@@ -2092,28 +2092,28 @@ size_t SequenceDB::MinimalMemory( int fr
printf( "nApproximated minimal memory consumption:n" );
mem = N*sizeof(Sequence) + options.total_desc + N;
if( options.store_disk == false ) mem += options.total_letters + N;
- printf( "%-16s: %iMn", "Sequence", mem/mega );
+ printf( "%-16s: %luMn", "Sequence", (unsigned long) mem/mega );
mem_need += mem;

mem = bsize;
- printf( "%-16s: %i X %iM = %iMn", "Buffer", T, mem/mega, T*mem/mega );
+ printf( "%-16s: %i X %luM = %luMn", "Buffer", T, (unsigned long) mem/mega, (unsigned long) T*mem/mega );
mem_need += T*mem;

mem = F*(sizeof(Sequence*) + sizeof(IndexCount)) + NAAN*sizeof(NVector<IndexCount>);
- printf( "%-16s: %i X %iM = %iMn", "Table", table, mem/mega, table*mem/mega );
+ printf( "%-16s: %i X %luM = %luMn", "Table", table, (unsigned long) mem/mega, (unsigned long) table*mem/mega );
mem_need += table*mem;

mem = sequences.capacity()*sizeof(Sequence*) + N*sizeof(int);
mem += Comp_AAN_idx.size()*sizeof(int);
- printf( "%-16s: %iMn", "Miscellaneous", mem/mega );
+ printf( "%-16s: %luMn", "Miscellaneous", (unsigned long) mem/mega );
mem_need += mem;

- printf( "%-16s: %iMnn", "Total", mem_need/mega );
+ printf( "%-16s: %luMnn", "Total", (unsigned long) mem_need/mega );

if(options.max_memory and options.max_memory < mem_need + 50*table ){
char msg[200];
- sprintf( msg, "not enough memory, please set -M option greater than %in",
- 50*table + mem_need/mega );
+ sprintf( msg, "not enough memory, please set -M option greater than %lun",
+ (unsigned long) 50*table + mem_need/mega );
bomb_error(msg);
}
return mem_need;
@@ -2169,7 +2169,7 @@ void SequenceDB::DoClustering( int T, co
printf( "Table limit with the given memory limit:n" );
printf( "Max number of representatives: %in", MAXNUM*CHUNK2 );
if( options.max_memory )
- printf( "Max number of word counting entries: %in", mem_limit );
+ printf( "Max number of word counting entries: %lun", (unsigned long) mem_limit );
else mem_limit = options.max_entries;
printf( "n" );

@@ -2301,7 +2301,7 @@ void SequenceDB::DoClustering( int T, co
}else if( i < m ){
printf( "r---------- %6i remaining sequences to the next cyclen", m-i );
}
- printf( "---------- new table with %8i representativesn", word_table.sequences.size() );
+ printf( "---------- new table with %8lu representativesn", (unsigned long) word_table.sequences.size() );
if( (last_table.size + word_table.size) > tabsize )
tabsize = last_table.size + word_table.size;
last_table.Clear();
@@ -2312,7 +2312,7 @@ void SequenceDB::DoClustering( int T, co
}
printf( "n%9li finished %9li clustersn", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "nApprixmated maximum memory consumption: %iMn", mem );
+ printf( "nApprixmated maximum memory consumption: %luMn", (unsigned long) mem );
last_table.Clear();
word_table.Clear();
}
@@ -2694,7 +2694,7 @@ void SequenceDB::DoClustering( const Opt
printf( "Table limit with the given memory limit:n" );
printf( "Max number of representatives: %in", MAXNUM*CHUNK2 );
if( options.max_memory )
- printf( "Max number of word counting entries: %in", mem_limit );
+ printf( "Max number of word counting entries: %lun", (unsigned long) mem_limit );
else mem_limit = options.max_entries;
printf( "n" );

@@ -2753,7 +2753,7 @@ void SequenceDB::DoClustering( const Opt
}
printf( "n%9li finished %9li clustersn", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "nApprixmated maximum memory consumption: %iMn", mem );
+ printf( "nApprixmated maximum memory consumption: %luMn", (unsigned long) mem );
temp_files.Clear();
word_table.Clear();

diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.h cd-hit-v4.3-2010-10-25.patched/cdhit-common.h
--- cd-hit-v4.3-2010-10-25/cdhit-common.h 2010-10-26 11:04:34.000000000 +1100
+++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.h 2011-01-18 15:36:44.947982894 +1100
@@ -463,7 +463,7 @@ struct WorkingBuffer
Vector<int> diag_score2;
Vector<int> aan_list_comp;
char seqi_comp[MAX_SEQ];
- int total_bytes;
+ size_t total_bytes;

WorkingBuffer( int frag=0, const Options & options=Options() ){
Set( frag, options );
@@ -471,8 +471,8 @@ struct WorkingBuffer
void Set( int frag, const Options & options ){
bool est = options.isEST;
int m = MAX_UAA*MAX_UAA;
- int max_len = options.max_length;
- int band = max_len*max_len;
+ size_t max_len = (size_t) options.max_length;
+ size_t band = max_len*max_len;
if( est ) m = m * m;
if( band > options.band_width ) band = options.band_width;
taap.resize( m );
@@ -550,7 +550,7 @@ class SequenceDB
void SortDivide( Options & options, bool sort=true );
void MakeWordTable( const Options & optioins );

- size_t MinimalMemory( int frag_no, int bsize, int T, const Options & options );
+ size_t MinimalMemory( int frag_no, size_t bsize, int T, const Options & options );

void ClusterOne( Sequence *seq, int id, WordTable & table,
WorkingParam & param, WorkingBuffer & buf, const Options & options );

Please log in to add comments and receive followups via email.
No followups have been posted
No results for "Dependent on ticket"
No results for "Dependent on Task"
No other tickets are dependent on this ticket
Ticket change history
Field Old value Date By
status_id Pending 07/14/11 01:19 liwz
close_date 12/31/69 19:00 07/14/11 01:19 liwz

© 1998-2025 Scilico, LLC. All rights reserved.