Original submission:
Hello
Sadly, a variable-type bug on 64-bit machines causes the estimated memory requirement to come out negative when starting a run (tested with cd-hit-est and a single 50,000 bp sequence). The following patch ought to fix it.
Thanks,
Michael Ott & Alexie Papanicolaou
CSIRO Ecosystem Sciences
diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.c++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++
--- cd-hit-v4.3-2010-10-25/cdhit-common.c++ 2010-10-26 11:04:34.000000000 +1100
+++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++ 2011-01-18 15:40:46.285906232 +1100
@@ -2081,9 +2081,9 @@ void SequenceDB::ClusterOne( Sequence *s
}
}
#include<assert.h>
-size_t SequenceDB::MinimalMemory( int frag_no, int bsize, int T, const Options & options )
+size_t SequenceDB::MinimalMemory( int frag_no, size_t bsize, int T, const Options & options )
{
- int N = sequences.size();
+ size_t N = sequences.size();
int F = frag_no < MAXNUM*CHUNK2 ? frag_no : MAXNUM*CHUNK2;
size_t mem_need = 0;
size_t mem, mega = 1000000;
@@ -2092,28 +2092,28 @@ size_t SequenceDB::MinimalMemory( int fr
printf( "nApproximated minimal memory consumption:n" );
mem = N*sizeof(Sequence) + options.total_desc + N;
if( options.store_disk == false ) mem += options.total_letters + N;
- printf( "%-16s: %iMn", "Sequence", mem/mega );
+ printf( "%-16s: %luMn", "Sequence", (unsigned long) mem/mega );
mem_need += mem;
mem = bsize;
- printf( "%-16s: %i X %iM = %iMn", "Buffer", T, mem/mega, T*mem/mega );
+ printf( "%-16s: %i X %luM = %luMn", "Buffer", T, (unsigned long) mem/mega, (unsigned long) T*mem/mega );
mem_need += T*mem;
mem = F*(sizeof(Sequence*) + sizeof(IndexCount)) + NAAN*sizeof(NVector<IndexCount>);
- printf( "%-16s: %i X %iM = %iMn", "Table", table, mem/mega, table*mem/mega );
+ printf( "%-16s: %i X %luM = %luMn", "Table", table, (unsigned long) mem/mega, (unsigned long) table*mem/mega );
mem_need += table*mem;
mem = sequences.capacity()*sizeof(Sequence*) + N*sizeof(int);
mem += Comp_AAN_idx.size()*sizeof(int);
- printf( "%-16s: %iMn", "Miscellaneous", mem/mega );
+ printf( "%-16s: %luMn", "Miscellaneous", (unsigned long) mem/mega );
mem_need += mem;
- printf( "%-16s: %iMnn", "Total", mem_need/mega );
+ printf( "%-16s: %luMnn", "Total", (unsigned long) mem_need/mega );
if(options.max_memory and options.max_memory < mem_need + 50*table ){
char msg[200];
- sprintf( msg, "not enough memory, please set -M option greater than %in",
- 50*table + mem_need/mega );
+ sprintf( msg, "not enough memory, please set -M option greater than %lun",
+ (unsigned long) 50*table + mem_need/mega );
bomb_error(msg);
}
return mem_need;
@@ -2169,7 +2169,7 @@ void SequenceDB::DoClustering( int T, co
printf( "Table limit with the given memory limit:n" );
printf( "Max number of representatives: %in", MAXNUM*CHUNK2 );
if( options.max_memory )
- printf( "Max number of word counting entries: %in", mem_limit );
+ printf( "Max number of word counting entries: %lun", (unsigned long) mem_limit );
else mem_limit = options.max_entries;
printf( "n" );
@@ -2301,7 +2301,7 @@ void SequenceDB::DoClustering( int T, co
}else if( i < m ){
printf( "r---------- %6i remaining sequences to the next cyclen", m-i );
}
- printf( "---------- new table with %8i representativesn", word_table.sequences.size() );
+ printf( "---------- new table with %8lu representativesn", (unsigned long) word_table.sequences.size() );
if( (last_table.size + word_table.size) > tabsize )
tabsize = last_table.size + word_table.size;
last_table.Clear();
@@ -2312,7 +2312,7 @@ void SequenceDB::DoClustering( int T, co
}
printf( "n%9li finished %9li clustersn", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "nApprixmated maximum memory consumption: %iMn", mem );
+ printf( "nApprixmated maximum memory consumption: %luMn", (unsigned long) mem );
last_table.Clear();
word_table.Clear();
}
@@ -2694,7 +2694,7 @@ void SequenceDB::DoClustering( const Opt
printf( "Table limit with the given memory limit:n" );
printf( "Max number of representatives: %in", MAXNUM*CHUNK2 );
if( options.max_memory )
- printf( "Max number of word counting entries: %in", mem_limit );
+ printf( "Max number of word counting entries: %lun", (unsigned long) mem_limit );
else mem_limit = options.max_entries;
printf( "n" );
@@ -2753,7 +2753,7 @@ void SequenceDB::DoClustering( const Opt
}
printf( "n%9li finished %9li clustersn", sequences.size(), rep_seqs.size() );
mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
- printf( "nApprixmated maximum memory consumption: %iMn", mem );
+ printf( "nApprixmated maximum memory consumption: %luMn", (unsigned long) mem );
temp_files.Clear();
word_table.Clear();
diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.h cd-hit-v4.3-2010-10-25.patched/cdhit-common.h
--- cd-hit-v4.3-2010-10-25/cdhit-common.h 2010-10-26 11:04:34.000000000 +1100
+++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.h 2011-01-18 15:36:44.947982894 +1100
@@ -463,7 +463,7 @@ struct WorkingBuffer
Vector<int> diag_score2;
Vector<int> aan_list_comp;
char seqi_comp[MAX_SEQ];
- int total_bytes;
+ size_t total_bytes;
WorkingBuffer( int frag=0, const Options & options=Options() ){
Set( frag, options );
@@ -471,8 +471,8 @@ struct WorkingBuffer
void Set( int frag, const Options & options ){
bool est = options.isEST;
int m = MAX_UAA*MAX_UAA;
- int max_len = options.max_length;
- int band = max_len*max_len;
+ size_t max_len = (size_t) options.max_length;
+ size_t band = max_len*max_len;
if( est ) m = m * m;
if( band > options.band_width ) band = options.band_width;
taap.resize( m );
@@ -550,7 +550,7 @@ class SequenceDB
void SortDivide( Options & options, bool sort=true );
void MakeWordTable( const Options & optioins );
- size_t MinimalMemory( int frag_no, int bsize, int T, const Options & options );
+ size_t MinimalMemory( int frag_no, size_t bsize, int T, const Options & options );
void ClusterOne( Sequence *seq, int id, WordTable & table,
WorkingParam & param, WorkingBuffer & buf, const Options & options );
|