package charite.christo.aaSetup;
import charite.christo.*;
import charite.christo.strap.*;
import java.util.*;
import java.io.*;
import static charite.christo.ChUtils.*;
import static charite.christo.strap.LocalSequenceDB.*;
#if 0
/**
   #######################################
   # Pfam IDs
   zcat /local/bioinf/DB/pfam/Pfam-A.seed.gz | fgrep 'DR PDB; ' | sed 's|.*DR PDB; \(....\) \(.\).*|>\1_\2|1' > TMP/otherIdList
   #######################################

   aa_setup -formatdb -localSequenceDB=UNIPROT:tmp/DB/uniprot_sprot.fasta.gz -GFF=UNIPROT:tmp/DB/uniprot-reviewed.gff -sort=upPrefOrder.txt
   hash.tmp/136:B6H768=2ED93FE

   tmp/DB
   ln -s /local/bioinf/DB/UniProt/{uniprot_sprot.fasta.gz,uniprot-reviewed.gff} tmp/DB
*/
#endif //0

@*SARRAY_GFF_SKIP_NAMES
  Chain|Compositional|Repeat|Helix|Turn|Beta strand
*@
@*SARRAY_GFF_RPLC
status=By similarity|$SBS
status=Potential|$SP
*@
#define _mapGffDB mapNoClr(255)
public final class LocalSequenceDBFormater{
    private final static String SFX_ACTG=XMATCH_SFX_DIR+"actg.tmp",SFX_TSV=XMATCH_SFX_DIR+"tsv.tmp",SFX_SORTED=XMATCH_SFX_DIR+"sorted.tmp";
#define MSG_MEMORY "If the program stops with an OutOfMemoryError then increase the maximum heap size with the java option -Xmx"
#define MIN_SEQLEN 25
#define HASH_ANNOTATION 1
#define JOINSECTIONS 200
    /** The database whose sequence file is formatted by this instance. */
    private final LocalSequenceDB _db;
    /** Pending "id=hexPosition" lines, indexed by hash kind (0 or HASH_ANNOTATION) and by joined section. */
    private final BA[][] _bySection=new BA[2][SECTIONS/JOINSECTIONS];
    /** Scratch buffer reused by the record writers. */
    private final BA _tmp=new BA(999);
    /** Counter of processed records, used for console progress output. */
    private static int _progress;
    /** The sequence file of the database, as returned by its f() method. */
    private final File _f;

    /**
     * Creates a formatter bound to one sequence database.
     * @param sdb the database to format; its f() result is used as the source file
     */
    private LocalSequenceDBFormater(LocalSequenceDB sdb){
        this._db=sdb;
        this._f=sdb.f();
    }

    /**
     * Runs the complete formatting pipeline for every registered LocalSequenceDB:
     * TSV file, sorted TSV file, removal of the per-section hash files, GFF/FASTA and
     * reduced-alphabet files, the two hash files and finally the 2bit file.
     * An OutOfMemoryError is reported together with the -Xmx hint and terminates the program.
     */
    public static void format(){
        try{
            for(LocalSequenceDB database:LocalSequenceDB.instances()){
                final LocalSequenceDBFormater formater=new LocalSequenceDBFormater(database);
                formater.tsvFile();
                formater.tsvFileSorted();
                /* NOTE(review): the hash directories are removed unconditionally, whereas gfffaFile() */
                /* returns early without refilling them when its outputs are up to date - confirm that */
                /* mkHashFile() copes with missing section files in that case. */
                formater.rmSingleHashFiles();
                formater.gfffaFile();
                FORi(0,2) formater.mkHashFile(i);
                formater.mk2bit();
            }
        }catch(OutOfMemoryError outOfMemory){
            errorEx(outOfMemory,RED_ERROR,MSG_MEMORY);
            MEIN_DIE("LocalSequenceDBFormater ");
        }
    }
    /**
     * Tells whether a generated file is current.
     * @param f the generated file
     * @return true if f is non-empty and newer than the sequence file and than every GFF file
     */
    private boolean upToDate(File f){
        boolean current=sze(f)>0 && fileNewr(f,_f);
        /* The GFF list is always obtained, even when the file is already known to be stale. */
        final File[] gffFiles=ffGFF();
        final int n=gffFiles.length;
        for(int k=0; current && k<n; k++){
            current=fileNewr(f,gffFiles[k]);
        }
        return current;
    }
    /**
     * Reports on the console whether a file is up to date or is about to be created.
     * @param f the generated file to check
     * @return true if the file must be (re)generated
     */
    private boolean needMkFile(File f){
        final boolean current=upToDate(f);
        final String message;
        if(current) message="File is up-to-date: ";
        else message="Creating ";
        baOut(message).aln(f);
        return !current;
    }
    /**
     * Finishes the creation of an output file: closes the stream, strips a trailing "TMP"
     * from the file name by renaming, and verifies the result with upToDate().
     * Failure is reported with EXIT_NOW.
     * @param file the file that was written, possibly carrying the "TMP" suffix
     * @param os the stream that wrote the file; may be null
     */
    private void createdFile(File file,OutputStream os){
        closeStrm(os);
        final String path=s(file);
        final String pathWithoutTmp=delSfx("TMP",path);
        final File result;
        /* Identity comparison: presumably delSfx returns its argument itself when there is no such suffix. */
        if(path!=pathWithoutTmp){
            result=file(pathWithoutTmp);
            renamFile(file,result);
        }else{
            result=file;
        }
        if(upToDate(result)) baOut("\nCreated file ").a(result).aln(GREEN_SUCCESS);
        else baOut("Failed to generate ").a(result).special(EXIT_NOW);
    }
/* ---------------------------------------- */
/* >>> Step 0 Preferred Order File >>> */
/*  zcat /local/bioinf/DB/pfam/Pfam-A.seed.gz | cut -c 1-20 | grep '^\([A-Z][A-Z_]*\)/[0-9][0-9]*-[0-9][0-9]*' | sed 's|/.*$||1' > upPrefOrder.txt */

    /** Maps each non-blank line of the preferred-order file to its 1-based rank among the non-blank lines. */
    private final Map<CharSequence,Integer>MAP_PREFORDER=new HashMap<CharSequence,Integer>();
    /** Number of calls of readPreferedOrder(); the file is read on the first call only. */
    private int _prefOrderDone=0;
    /**
     * Reads the preferred-order file into MAP_PREFORDER.
     * Only the first invocation does any work. Blank lines are skipped and do not
     * consume a rank. A summary line is printed if a file is configured.
     */
    public void readPreferedOrder(){
        if(_prefOrderDone++==0){
            final File f=iFile(F_PRPRTY_iPAR_FORMATDB_SORT);
            int count=0;
            if(sze(f)>0){
                final ChInStream is=new ChInStream(f,1024);
                try{
                    final BA LINE=new BA(999);
                    while(is.readLine(clr(LINE))){
                        if(LINE.trim().length()>0){
                            MAP_PREFORDER.put(LINE.toString(),Integer.valueOf(++count));
                        }
                    }
                }finally{
                    /* The stream was previously left open; close it like the other readers of this class do. */
                    closeStrm(is);
                }
            }
            if(f!=null) baOut("readPreferedOrder -sort=").aFile(f).a(" #").a(count).aln(GREEN_DONE);
        }
    }
/* <<< Step 0 Preferred Order File <<< */
/* ---------------------------------------- */
/* >>> Step 1 TSV-File >>> */
    /**
     * Step 1: converts the sequence file into a tab separated file, one record per line.
     * Records are delimited by lines starting with '>' or with "//"; each collected record
     * containing at least one letter is passed to tsvFile1(). Output goes to a "TMP" file
     * which createdFile() renames. The preferred-order map is cleared afterwards.
     */
    void tsvFile(){
        readPreferedOrder();
        final File tsv=file(addSfx(SFX_TSV,_f));
        final File tsvTmp=file(addSfx("TMP",tsv));
        if(!needMkFile(tsv)) return;
        final ChInStream source=new ChInStream(_f,999);
        final BA curLine=new BA(999);
        final BA record=new BA(999);
        OutputStream sink=null;
        try{
            sink=fOutStrm(0,tsvTmp);
            while(source.readLine(clr(curLine))){
                final char first=curLine.charAt(0);
                final char second=curLine.charAt(1);
                final boolean terminator=first=='/' && second=='/';
                if(terminator || first=='>'){
                    /* A new record begins: emit what has been collected so far. */
                    if(nxt(0,chrClas(LETTR),record)>=0) tsvFile1(record,sink);
                    clr(record);
                }
                if(terminator) continue;
                record.aln(curLine);
            }
        }catch(IOException ioEx){
            errorEx(ioEx,"tsvFile ",tsvTmp);
            MEIN_DIE("LocalSequenceDBFormater ");
        }
        /* Emit the last record of the file. */
        if(nxt(0,chrClas(LETTR),record)>=0) tsvFile1(record,sink);
        closeStrm(source);
        createdFile(tsvTmp,sink);
        clr(MAP_PREFORDER);
    }

    /**
     * Looks up the rank of an ID in the preferred-order map.
     * @param sb buffer holding the ID
     * @param from if positive, the begin of sb is moved to this offset for the lookup and reset to 0 afterwards
     * @return the stored rank, or 0 if the ID is not in the map
     */
    public int rankForID(BA sb,int from){
        final boolean shifted=from>0;
        if(shifted) sb.setBegin(from);
        final Integer rank=MAP_PREFORDER.get(sb);
        if(shifted) sb.setBegin(0);
        return rank==null ? 0 : rank.intValue();
    }
    /**
     * Writes one FASTA record as a line of the TSV file.
     * The line consists of the tab separated columns: rank (left-padded with '0'),
     * ID(s), upper-case sequence, organism, description.
     * Records whose sequence is shorter than MIN_SEQLEN are dropped.
     * Text not starting with '>' is reported as an error with EXIT_NOW.
     *
     * @param txt the complete record: header line followed by the sequence lines
     * @param os the stream of the TSV file
     */
    private void tsvFile1(BA txt,OutputStream os){
        if(txt.length()==0) return;
        try{
            final byte[]T=txt.bytes();
            final int E=txt.end();
            final BA sb=clr(_tmp),sbID=baClr(31);
            /* Begin and end of the sequence column within sb. */
            final int seqB,seqE;
            final boolean[]no_lettr_digt_us_colon=chrClas(-LETTR_DIGT_US_COLON);

            if(txt.charAt(0)=='>'){/*X Fasta format */
                String pfx;
                /* f: begin of the first ID behind '>' or behind the ">gb|" / ">sp|" prefix. */
                final int f=strStarts(pfx=">gb|",txt) || strStarts(pfx=">sp|",txt)?pfx.length():1;
                /* t: end of the first ID;  e: end of the header line. */
                final int t=nxt(STR_E,no_lettr_digt_us_colon,T,f,E);
                final int e=strchr('\n'|STR_E,T,0,E);
                /* Report the header line itself (0..e); e..E would be the sequence part of the record. */
                if(t<0 || t-f<1) baOut(RED_WARNING).a("LocalSequenceDB do not understand line ").aFT(T,0,e).special(EXIT_NOW);
                sbID.aFT(T,f,t);
                int rank=rankForID(sbID,0);
                final int nextE=T[t]=='|'?strchr(STR_E|' ',T,t+1,E):-1;
                int organismF=0,organismT=0;
                /* The field behind the bar is taken as second ID only if nxt() finds no character of the complement class of UPPR_DIGT_US in it. */
                if(nextE>t+1 && 0>nxt(chrClas(-UPPR_DIGT_US),T,t+1,nextE)){/*X  >sp|Q6GZX4|001R_FRG3G  Two IDs separated by a bar */
                    sbID.a(XMATCH_SEP_SYN_ID);
                    final int idStart=sbID.end();
                    sbID.aFT(T,t+1,nextE);
                    if(rank==0) rank=rankForID(sbID,idStart);
                    /* The organism is the part behind the underscore. Without an underscore the */
                    /* organism stays empty; formerly the range started at offset 0 of the header. */
                    final int underscore=strchr('_',T,t+1,nextE);
                    if(underscore>=0){
                        organismF=underscore+1;
                        organismT=nextE;
                    }
                }
                sb.an('0',7-strSzeOfInt(rank)).a(rank).a('\t').a(sbID);
                seqB=sb.a('\t').end();
                sb.aFilter(FILTER_TO_UPPER|LETTR,T,e+1,E);
                seqE=sb.end();
                sb.a('\t');
                if(nextE>0) sb.aFT(T,organismF,organismT);
                sb.a('\t');
                /* Description: text behind the IDs up to " OS=" or to the end of the header line. */
                if(nextE>0) sb.aFT(T,nextE+1,strstr(STR_E," OS=",T,nextE,e));
            }else{
                baOut(RED_ERROR).a("Not fasta format!").aln(txt).special(EXIT_NOW);
                return;
            }
#if CPP_DEACTIVATED
            else {/*X  No Fasta format*/
                _p.removeSequenceRef("*");
                _parser.parse(0,txt,_p);
                final String ss[]=_p.getRefs(0),u=_p.seqId(BLAST4ID_DB_UNIPROT),acc=orS(_p.seqId(GETSEQID_ACCESSION),u);
                appendID(acc,sb);
                if(u!=null && !u.equals(acc)) appendID(u,sb);
                for(String s:ss) if(!s.equals(acc) && !s.equals(u)) sb.a(XMATCH_SEP_SYN_ID).a(s);
                seqB=sb.del(XMATCH_SEP_SYN_ID).a('\t').end();
                sb.a0(_p.getResTypeUC());
            }
#endif //CPP_DEACTIVATED
            /* Only the sequence column counts for the minimum length, not organism and description. */
            if(seqE-seqB>=MIN_SEQLEN) sb.a('\n').writeTxt(os);
            if(_progress%2000==0) baOut(ANSI_CURSOR_LEFT+ANSI_CLR_FROM_CURSOR).a("LocalSequenceDBFormater ").a("step 1 ").a(_progress).send();
            _progress++;
        }catch(IOException iox) {baOut("tsvFile1 ").a(iox).special(EXIT_NOW);}
    }

/* private final Protein _p=new Protein(); */
/* private void appendID(String id,BA sb) { */
/* if(id==null) return; */
/* final int colon=id.indexOf(':'); */
/* if(colon>0 && !id.startsWith(_db)) return; */
/* sb.a(id).a(id,colon+1,MAX_INT).a(XMATCH_SEP_SYN_ID); */
/* } */
/* private final ProteinParser _parser=new SwissprotParser(); */

/* <<< Step 1 <<< */
/* ---------------------------------------- */
/* >>> Step 2 Sorted File >>> */
    /**
     * Step 2: creates the sorted TSV file by running "sort ... | uniq" in a shell
     * on the TSV file of step 1. Nothing is done if the sorted file is up to date.
     */
    void tsvFileSorted(){
        final File sorted=file(addSfx(SFX_SORTED,_f));
        if(needMkFile(sorted)){
            /* Shell command line:  sort -k 1,3 <tsv file> | uniq > <sorted file> */
            final BA commandLine=new BA(99);
            commandLine.a("sort -k 1,3 ").a(_f).a(SFX_TSV+" | uniq > ").a(sorted);
            rtExecV(0,new Object[]{"sh","-c",commandLine});
            createdFile(sorted,null);
        }
    }
/* <<< Step 2 <<< */
/* ---------------------------------------- */
/* >>> Step 3 GFF >>> */
    /** Cached result of ffGFF(); null until the first call. */
    private static File[]_ffGff;
    /**
     * Returns the GFF files given by the GFF program parameter.
     * The parameter value is split into tokens of the form "DBNAME:path". For each token the
     * file is stored in the result and its database name, including the colon, is recorded in
     * _mapGffDB. A token without database name or a file of length zero is reported with EXIT_NOW.
     * @return the GFF files; the array is computed once and then reused
     */
    private static File[]ffGFF(){
        if(_ffGff!=null) return _ffGff;
        final String[]specs=splitTkns(0,prprty(iPAR_FORMATDB_GFF),0,MAX_INT,chrClas(SPACE_COMMA));
        final File[]files=new File[specs.length];
        _ffGff=files;
        FORi(0,specs.length){
            final String spec=specs[i];
            final int colon=spec.indexOf(':');
            if(colon<(IF_WINDOWS(isWin()?1:)0)){
                baOut("Error for Parameter "+PAR_FORMATDB_GFF+":\nThe database file must be preceded by database name  colon\n Example: "+
                      PAR_FORMATDB_GFF+"UNIPROT:/local/files/db/uniprot-reviewed.gff.gz").special(EXIT_NOW);
            }
            final File gffFile=file(spec.substring(colon+1));
            files[i]=gffFile;
            _mapGffDB.put(gffFile,spec.substring(0,colon+1));
            if(gffFile.length()==0) baOut(PAR_FORMATDB_GFF+": ").aFile(gffFile).special(EXIT_NOW);
        }
        return _ffGff;

    }
    //_altUrl
    /** GFF annotations per database name; filled on the first call of gff(). */
    private Map<String,Map>_mapGff;
    /**
     * Returns the GFF annotations for the database of this formatter.
     * On the first call all GFF files are loaded; files registered for the same
     * database name in _mapGffDB share one map.
     * @return the annotation map stored under _db.db(), or null if there is none
     */
    private Map gff(){
        if(_mapGff==null){
            _mapGff=new HashMap();
            for(File gffFile:ffGFF()){
                final String dbName=(String)_mapGffDB.get(gffFile);
                Map annotations=_mapGff.get(dbName);
                if(annotations==null){
                    annotations=new HashMap();
                    _mapGff.put(dbName,annotations);
                }
                gff(gffFile,annotations);
            }
        }
        final String ownDb=_db.db();
        return _mapGff.get(ownDb);
    }

    /**
     * Loads one GFF file into memory, collecting the feature lines per sequence ID.
     * A line starting with "##sequence-region " begins a new ID; the text collected
     * for the previous ID is then stored in the map. Feature lines whose third column
     * equals one of SARRAY_GFF_SKIP_NAMES are ignored; in all others the replacements
     * of SARRAY_GFF_RPLC are applied.
     * A warning is printed if the maximum heap is smaller than 1.5 times the file size.
     *
     * @param f the GFF file
     * @param m receives for each sequence ID the bytes of its collected feature text
     */
    private static void gff(File f,Map<String,byte[]>m){
        final ChInStream is=new ChInStream(f,999);
        final BA LINE=new BA(999),gff=new BA(999);
        String id=null;
        int count=0;
        /* Estimated memory requirement: one and a half times the file size. */
        final long memory=f.length()*3/2;
        baOut("Loading gff file into memory: ").aln(f);
        if(Runtime.getRuntime().maxMemory()<memory){
            baOut(RED_WARNING).aa(MSG_MEMORY,memory/1000000,'M').aln();
        }
        nextLine:
        while(is.readLine(clr(LINE))){
            final byte[]T=LINE.bytes();
            final int E=LINE.end();
            if(E==0) continue;

            if(T[0]=='#' && strStarts("##sequence-region ",T)){
                /* Store the features collected for the previous ID, then start the next ID. */
                if(id!=null && gff.end()>0) {m.put(id,gff.newBytes());}
                /* 18 is the length of "##sequence-region "; presumably wordAt yields the word starting there. */
                id=wordAt(-SPC,T,18,MAX_INT);
                clr(gff);
                if(count++%10000==0) baOut("\rgff ").a(count);
            }else{
                /* f1, f2, f3: offsets just behind the first, second and third tab of the line. */
                final int
                    f1=strchr('\t',T,0,E)+1,
                    f2=strchr('\t',T,f1,E)+1,
                    f3=strchr('\t',T,f2,E)+1;
                /* Fewer than three tabs: not a feature line. */
                if(f3<=0) continue;
                final byte c=T[f2];
                /* Skip the line if its third column is exactly one of the skip names. */
                for(String skip:arry(SARRAY_GFF_SKIP_NAMES)) if(c==skip.charAt(0) && f2+skip.length()+1==f3 && strEquAt(0,skip,T,f2)) continue nextLine;
                for(String[]ss:arry2(SARRAY_GFF_RPLC)) LINE.replace(0,ss[0],ss[1]);
                /* Keep the line from the tab preceding the third column onwards, prefixed by one more tab. */
                gff.a('\t').aFT(LINE,f2-1,MAX_INT).a('\n');
            }
        }
        /* Store the features of the last ID of the file. */
        if(id!=null && gff.end()>0) m.put(id,gff.newBytes());
        baOut("\nGFF file loaded ").aln(GREEN_DONE);
        closeStrm(is);
    }
/* <<< Step 3 GFF <<< */
/* ---------------------------------------- */
/* >>> Step 4 DB File >>> */
    /**
     * Step 4: reads the sorted TSV file and writes two files: the GFF/FASTA file
     * (via gfffaFile1) and the reduced-alphabet FASTA file (via actgFile1).
     * Both files are written to "TMP" files which createdFile() renames.
     * If either file is out of date, both are regenerated.
     * Finally all buffered ID positions are flushed with writeSections(true).
     */
    private void gfffaFile(){
        _progress=0;
        final File fOut=file(addSfx(XMATCH_SFX_GFFFA,_f)),fOutTMP=file(addSfx("TMP",fOut));
        final File fActg=file(addSfx(SFX_ACTG,_f)),fActgTMP=file(addSfx("TMP",fActg));
        final boolean fOutOK=!needMkFile(fOut),fActgOK=!needMkFile(fActg);
        if(fOutOK && fActgOK) return;
        final ChInStream is=new ChInStream(addSfx(SFX_SORTED,_f),999);
        /* seq: sequence text of the current group;  ids: IDs collected since the last actgFile1 call. */
        final BA LINE=new BA(999),seq=new BA(999),ids=new BA(999);
        OutputStream os=null,osActg=null;
        final Map<String,byte[]>gff=gff();
        /* Receives the tab positions of the current line. */
        final int[]tt=new int[5];
        try{
            os=fOutStrm(0,fOutTMP);
            osActg=fOutStrm(0,fActgTMP);
            /* Number of lines since the last actgFile1 call. */
            int join=0;

            while(is.readLine(clr(LINE))){
                final byte[]T=LINE.bytes();
                tabulatrs('\t',T,0,LINE.end(),tt);
                /* Columns: rank | id | sequence | ...  */
                final int idB=tt[0]+1,idE=tt[1],seqB=tt[1]+1,seqE=tt[2];
                gfffaFile1(T,tt,os,gff);
                ids.aFT(T,idB,idE).a(XMATCH_SEP_ID);
                /* Write an entry when more than three lines were joined or when the sequence differs from seq. */
                /* NOTE(review): seq is filled from seqB up to MAX_INT, i.e. apparently including the columns */
                /* behind the sequence, whereas the comparison uses the length seqE-seqB - confirm the intent. */
                if(join>3 || seqE-seqB!=seq.end() || !strEquAt(0,seq,LINE,seqB)){
                    actgFile1(T,seqB,seqE,ids.del(XMATCH_SEP_ID),osActg);
                    clr(seq).aFT(LINE,seqB,MAX_INT);
                    clr(ids);
                    join=0;
                }
                join++;
            }
            /* NOTE(review): LINE has been cleared by the argument clr(LINE) of the last readLine call, */
            /* so this final entry presumably gets an empty sequence - verify. */
            actgFile1(LINE.bytes(),1+strchr('\t',LINE),LINE.end(),ids,osActg);
        }catch(IOException iox){
            errorEx(iox,"gfffaFile ",fActgTMP,fOutTMP);
            MEIN_DIE("LocalSequenceDBFormater ");
        }
        closeStrm(is);
        createdFile(fOutTMP,os);
        createdFile(fActgTMP,osActg);
        /* Flush all ID positions still buffered by putID(). */
        writeSections(true);

    }

    /** Number of bytes written to the GFF/FASTA file so far, i.e. the offset of the next entry. */
    private long _gfffaPos;
    /** Array passed to splitTkns1() together with the ID range. */
    private final String[]_id1={null};
    /**
     * Writes one entry of the GFF/FASTA file and registers the file position of its IDs.
     * The entry consists of the ">" ID line, the sequence line, optional "#O=" (organism)
     * and "#D=" (description) lines and the GFF text found for any of the IDs.
     *
     * @param T bytes of one line of the sorted TSV file
     * @param tt positions of the tabs in T:  rank | id | sequence | organism | description
     * @param os stream of the GFF/FASTA file
     * @param gff GFF text per ID; may be null
     * @throws IOException if writing fails
     */
    private void gfffaFile1(byte[]T,int[]tt,OutputStream os,Map<String,byte[]>gff)throws IOException{
        final int idFrom=tt[0]+1, idTo=tt[1];
        final int seqFrom=idTo+1, seqTo=tt[2];
        final int orgFrom=seqTo+1, orgTo=tt[3];
        final int desFrom=orgTo+1, desTo=tt[4];
        final BA entry=clr(_tmp);
        entry.a('>').aFT(T,idFrom,idTo).a('\n');
        entry.aFT(T,seqFrom,seqTo).a('\n');
        if(orgTo-orgFrom>1) entry.a("#O=").aFT(T,orgFrom,orgTo).a('\n');
        if(desTo-desFrom>1) entry.a("#D=").aFT(T,desFrom,desTo).a('\n');
        final String[]synonyms=splitTkns1(T,idFrom,idTo,XMATCH_SEP_SYN_ID,_id1);
        boolean annotated=false;
        if(gff!=null){
            final int lengthBefore=entry.end();
            for(String synonym:synonyms){
                final byte[]features=gff.get(synonym);
                if(features!=null) entry.aln(features);
            }
            /* The entry is annotated if any GFF text has been appended. */
            annotated=entry.end()>lengthBefore;
        }
        for(String synonym:synonyms) putID(synonym,_gfffaPos,annotated);
        _gfffaPos+=entry.end();
        entry.writeTxt(os);
    }
    /**
     * Writes one entry of the reduced-alphabet FASTA file: a ">" header line with the
     * IDs followed by the output of ExactMatch.reduceAlphabet for the given range.
     * @param T bytes containing the sequence
     * @param B begin of the sequence in T
     * @param E end of the sequence in T
     * @param ids the IDs for the header line
     * @param os stream of the output file
     * @throws IOException if writing fails
     */
    private void actgFile1(byte[]T,int B,int E,BA ids,OutputStream os)throws IOException{
        final BA entry=clr(_tmp);
        entry.a('>').aln(ids);
        ExactMatch.reduceAlphabet(T,B,E,entry);
        entry.a('\n');
        entry.writeTxt(os);
    }
/* <<< Step 4 <<< */
/* ---------------------------------------- */
/* >>> Step 5 2bit >>> */
    /**
     * Step 5: creates the 2bit file by running the external program "faToTwoBit" on the
     * reduced-alphabet FASTA file. If the program returns a non-zero value the output
     * file is deleted and the program terminates.
     */
    private void mk2bit(){
        final File twoBit=file(addSfx(XMATCH_SFX_2BIT,_f));
        if(!needMkFile(twoBit)) return;
        final Object[] command={"faToTwoBit",addSfx(SFX_ACTG,_f),twoBit};
        if(rtExecV(0,command)!=0){
            fileDel(0,twoBit);
            baOut(RED_FAILED).aln();
            MEIN_DIE("LocalSequenceDBFormater ");
        }
        createdFile(twoBit,null);
    }
/* <<< Step 5 <<< */
/* ---------------------------------------- */
/* >>> Hash >>> */
    /**
     * Appends the buffered "id=position" lines to their section files and clears the buffers.
     * @param totally if true every existing buffer is written; otherwise only buffers
     *                holding at least 99 bytes
     */
    private void writeSections(boolean totally){
        ROFk0(2){
            final BA[] buffers=_bySection[k];
            ROFi0(buffers.length){
                final BA buffer=buffers[i];
                if(buffer!=null && (totally || buffer.end()>=99)){
                    appndToFile(buffer,_db.hashFile(k==HASH_ANNOTATION,i));
                    clr(buffer);
                }
            }
        }
    }
    /**
     * Buffers the line "id=hexPosition" for the section derived from the hash code of the ID.
     * The line is always added to the buffer of kind 0 and, if the entry has annotations,
     * also to the buffer of kind HASH_ANNOTATION. When a buffer exceeds 20000 bytes the
     * larger buffers are appended to their files by writeSections(false).
     *
     * @param id the sequence ID; null is ignored
     * @param pos the position of the entry in the GFF/FASTA file
     * @param hasAnno whether the entry contains GFF annotations
     */
    private void putID(String id,long pos,boolean hasAnno){
        if(id==null) return;
        final int hc=id.hashCode();
        /* -hc overflows for Integer.MIN_VALUE and stays negative, which could yield a negative array index. */
        /* That single value is mapped to 0; all other hash codes give the same section as before. */
        final int absHc=hc==Integer.MIN_VALUE?0:(hc<0?-hc:hc);
        final int section=(absHc%SECTIONS)/JOINSECTIONS;
        ROFi0(2){
            if(!hasAnno && i==HASH_ANNOTATION) continue;
            BA sb=_bySection[i][section];
            if(sb==null) sb=_bySection[i][section]=new BA(333);
            sb.a(id).a('=').aHex(pos).a('\n');
            if(sze(sb)>20*1000) writeSections(false);
        }
    }

    /** Removes both hash directories (with and without annotation) of the database using "rm -r". */
    private void rmSingleHashFiles(){
        final File annotatedDir=_db.hashFile(true,HASH_DIR);
        final File plainDir=_db.hashFile(false,HASH_DIR);
        final Object[] command={"rm","-r",annotatedDir,plainDir};
        rtExecV(0,command);
    }

    /**
     * Builds the final hash file and its range file from the per-section files.
     * For every section the "id=position" lines whose ID hashes to that section are written
     * to the hash file, each preceded by a newline. The range file receives per section the
     * start offset (4 bytes little endian) and the length (2 bytes little endian); therefore
     * a section must not exceed 0xFFFF bytes. JOINSECTIONS consecutive sections share one
     * input file, which is read when the first section of the group is processed.
     * Finally the sizes of both files are compared with the expected sizes.
     *
     * @param i the hash kind: HASH_ANNOTATION for entries with annotation, otherwise all entries
     */
    private void mkHashFile(int i){
        final File dir=_db.hashFile(i==HASH_ANNOTATION,HASH_DIR);
        OutputStream os=null,osRange=null;
        final BA txt=new BA(9999);
        final File f=_db.hashFile(i==HASH_ANNOTATION,HASH_FILE);
        final File fRange=_db.hashFile(i==HASH_ANNOTATION,HASH_RANGE_FILE);
        int pos=0;
        try{
            os=new FileOutputStream(f);
            osRange=new FileOutputStream(fRange);
            byte[]T=null;
            for(int eol[]=null,hash[]=null,section=0;section<SECTIONS;section++){
                if(section%JOINSECTIONS==0){
                    /* Load the file of the next group of sections and compute the section of each line. */
                    readBytes(new File(dir,s(section/JOINSECTIONS)),clr(txt));
                    T=txt.bytes();
                    hash=new int[(eol=txt.eol()).length];
                    FORiL(0,eol.length){
                        final int b=BOL0(iL,eol),hc=hashCd(T,b,strchr('=',T,b,eol[iL]));
                        /* -hc stays negative for Integer.MIN_VALUE; such a line would match no section and be lost. */
                        hash[iL]=(hc==Integer.MIN_VALUE?0:(hc<0?-hc:hc))%SECTIONS;
                    }
                }
                final int pos0=pos;
                FORiL(0,eol.length){
                    if(hash[iL]==section){
                        final int b=BOL0(iL,eol);
                        os.write('\n');
                        os.write(T,b,eol[iL]-b);
                        pos+=1+eol[iL]-b;
                    }
                }
                /* The length of a section is stored in two bytes. */
                if(pos-pos0>0xFFff){
                    baOut(RED_ERROR).a("LocalSequenceDBFormater ").aln("File too big! Need to increase # SECTIONS! ");
                    System.exit(1);
                    }
                baClr(32).aLittleEndian(pos0,4).aLittleEndian(pos-pos0,2).writeTxt(osRange);
            }
        }catch(IOException iox){errorEx(iox);}
        closeStrm(os);
        baOut("  Expected size=").a(pos).a(' ').aln(pos==sze(f)?GREEN_SUCCESS:RED_FAILED);
        closeStrm(osRange);
        baOut("  Expected size=").a(SECTIONS*RANGE_BYTES).a(' ').aln(SECTIONS*RANGE_BYTES==sze(fRange)?GREEN_SUCCESS:RED_FAILED);
    }
}
