Bioinformatics.org
[Regeneron]
Not logged in
  • Log in
  • Bioinformatics.org
    Membership (41423+) Group hosting [?] Wiki
    Franklin Award
    Sponsorships

    Careers
    About bioinformatics
    Bioinformatics training
    Bioinformatics jobs

    Research
    All information groups
    Online databases Online analysis tools Online education tools More tools

    Development
    All software groups
    FTP repository
    SVN & CVS repositories [?]
    Mailing lists

    Forums
    News & Commentary
  • Submit
  • Archives
  • Subscribe

  • Jobs Forum
    (Career Center)
  • Submit
  • Archives
  • Subscribe
  • CD-HIT: Sequence clustering software - Support tickets

    Submit | Open tickets | Closed tickets

    [ Ticket #1107 ] long sequences and 64 bit
    Date:
    01/18/11 01:27
    Submitted by:
    alpapan
    Assigned to:
    unset
    Category:
    Clustering
    Priority:
    5
    Ticket group:
    Critical
    Resolution:
    Unset
    Summary:
    long sequences and 64 bit
    Original submission:
    Hello

    Sadly, a variable-type bug on 64-bit machines causes the estimated memory requirement to come out negative when a run is initiated (tested with cd-hit-est and a single 50,000 bp sequence). The following patch ought to fix it.

    thanks
    michael ott & alexie papanicolaou
    CSIRO Ecosystem Sciences

    diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.c++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++
    --- cd-hit-v4.3-2010-10-25/cdhit-common.c++ 2010-10-26 11:04:34.000000000 +1100
    +++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.c++ 2011-01-18 15:40:46.285906232 +1100
    @@ -2081,9 +2081,9 @@ void SequenceDB::ClusterOne( Sequence *s
    }
    }
    #include<assert.h>
    -size_t SequenceDB::MinimalMemory( int frag_no, int bsize, int T, const Options & options )
    +size_t SequenceDB::MinimalMemory( int frag_no, size_t bsize, int T, const Options & options )
    {
    - int N = sequences.size();
    + size_t N = sequences.size();
    int F = frag_no < MAXNUM*CHUNK2 ? frag_no : MAXNUM*CHUNK2;
    size_t mem_need = 0;
    size_t mem, mega = 1000000;
    @@ -2092,28 +2092,28 @@ size_t SequenceDB::MinimalMemory( int fr
    printf( "\nApproximated minimal memory consumption:\n" );
    mem = N*sizeof(Sequence) + options.total_desc + N;
    if( options.store_disk == false ) mem += options.total_letters + N;
    - printf( "%-16s: %iM\n", "Sequence", mem/mega );
    + printf( "%-16s: %luM\n", "Sequence", (unsigned long) mem/mega );
    mem_need += mem;

    mem = bsize;
    - printf( "%-16s: %i X %iM = %iM\n", "Buffer", T, mem/mega, T*mem/mega );
    + printf( "%-16s: %i X %luM = %luM\n", "Buffer", T, (unsigned long) mem/mega, (unsigned long) T*mem/mega );
    mem_need += T*mem;

    mem = F*(sizeof(Sequence*) + sizeof(IndexCount)) + NAAN*sizeof(NVector<IndexCount>);
    - printf( "%-16s: %i X %iM = %iM\n", "Table", table, mem/mega, table*mem/mega );
    + printf( "%-16s: %i X %luM = %luM\n", "Table", table, (unsigned long) mem/mega, (unsigned long) table*mem/mega );
    mem_need += table*mem;

    mem = sequences.capacity()*sizeof(Sequence*) + N*sizeof(int);
    mem += Comp_AAN_idx.size()*sizeof(int);
    - printf( "%-16s: %iM\n", "Miscellaneous", mem/mega );
    + printf( "%-16s: %luM\n", "Miscellaneous", (unsigned long) mem/mega );
    mem_need += mem;

    - printf( "%-16s: %iM\n\n", "Total", mem_need/mega );
    + printf( "%-16s: %luM\n\n", "Total", (unsigned long) mem_need/mega );

    if(options.max_memory and options.max_memory < mem_need + 50*table ){
    char msg[200];
    - sprintf( msg, "not enough memory, please set -M option greater than %i\n",
    - 50*table + mem_need/mega );
    + sprintf( msg, "not enough memory, please set -M option greater than %lu\n",
    + (unsigned long) 50*table + mem_need/mega );
    bomb_error(msg);
    }
    return mem_need;
    @@ -2169,7 +2169,7 @@ void SequenceDB::DoClustering( int T, co
    printf( "Table limit with the given memory limit:\n" );
    printf( "Max number of representatives: %i\n", MAXNUM*CHUNK2 );
    if( options.max_memory )
    - printf( "Max number of word counting entries: %i\n", mem_limit );
    + printf( "Max number of word counting entries: %lu\n", (unsigned long) mem_limit );
    else mem_limit = options.max_entries;
    printf( "\n" );

    @@ -2301,7 +2301,7 @@ void SequenceDB::DoClustering( int T, co
    }else if( i < m ){
    printf( "\r---------- %6i remaining sequences to the next cycle\n", m-i );
    }
    - printf( "---------- new table with %8i representatives\n", word_table.sequences.size() );
    + printf( "---------- new table with %8lu representatives\n", (unsigned long) word_table.sequences.size() );
    if( (last_table.size + word_table.size) > tabsize )
    tabsize = last_table.size + word_table.size;
    last_table.Clear();
    @@ -2312,7 +2312,7 @@ void SequenceDB::DoClustering( int T, co
    }
    printf( "\n%9li finished %9li clusters\n", sequences.size(), rep_seqs.size() );
    mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
    - printf( "\nApprixmated maximum memory consumption: %iM\n", mem );
    + printf( "\nApprixmated maximum memory consumption: %luM\n", (unsigned long) mem );
    last_table.Clear();
    word_table.Clear();
    }
    @@ -2694,7 +2694,7 @@ void SequenceDB::DoClustering( const Opt
    printf( "Table limit with the given memory limit:\n" );
    printf( "Max number of representatives: %i\n", MAXNUM*CHUNK2 );
    if( options.max_memory )
    - printf( "Max number of word counting entries: %i\n", mem_limit );
    + printf( "Max number of word counting entries: %lu\n", (unsigned long) mem_limit );
    else mem_limit = options.max_entries;
    printf( "\n" );

    @@ -2753,7 +2753,7 @@ void SequenceDB::DoClustering( const Opt
    }
    printf( "\n%9li finished %9li clusters\n", sequences.size(), rep_seqs.size() );
    mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
    - printf( "\nApprixmated maximum memory consumption: %iM\n", mem );
    + printf( "\nApprixmated maximum memory consumption: %luM\n", (unsigned long) mem );
    temp_files.Clear();
    word_table.Clear();

    diff -rupN cd-hit-v4.3-2010-10-25/cdhit-common.h cd-hit-v4.3-2010-10-25.patched/cdhit-common.h
    --- cd-hit-v4.3-2010-10-25/cdhit-common.h 2010-10-26 11:04:34.000000000 +1100
    +++ cd-hit-v4.3-2010-10-25.patched/cdhit-common.h 2011-01-18 15:36:44.947982894 +1100
    @@ -463,7 +463,7 @@ struct WorkingBuffer
    Vector<int> diag_score2;
    Vector<int> aan_list_comp;
    char seqi_comp[MAX_SEQ];
    - int total_bytes;
    + size_t total_bytes;

    WorkingBuffer( int frag=0, const Options & options=Options() ){
    Set( frag, options );
    @@ -471,8 +471,8 @@ struct WorkingBuffer
    void Set( int frag, const Options & options ){
    bool est = options.isEST;
    int m = MAX_UAA*MAX_UAA;
    - int max_len = options.max_length;
    - int band = max_len*max_len;
    + size_t max_len = (size_t) options.max_length;
    + size_t band = max_len*max_len;
    if( est ) m = m * m;
    if( band > options.band_width ) band = options.band_width;
    taap.resize( m );
    @@ -550,7 +550,7 @@ class SequenceDB
    void SortDivide( Options & options, bool sort=true );
    void MakeWordTable( const Options & optioins );

    - size_t MinimalMemory( int frag_no, int bsize, int T, const Options & options );
    + size_t MinimalMemory( int frag_no, size_t bsize, int T, const Options & options );

    void ClusterOne( Sequence *seq, int id, WordTable & table,
    WorkingParam & param, WorkingBuffer & buf, const Options & options );

    Please log in to add comments and receive followups via email.
    No followups have been posted
    No results for "Dependent on ticket"
    No results for "Dependent on Task"
    No other tickets are dependent on this ticket
    Ticket change history
    Field Old value Date By
    status_id Pending 07/14/11 01:19 liwz
    close_date 12/31/69 19:00 07/14/11 01:19 liwz

     

    Copyright © 2019 · Scilico, LLC