{seq.pas - AnnHyb
 	Copyright (C) 1997-2007 Olivier Friard
        (annhyb @ gmail.com)
 
  This file is part of AnnHyb.

AnnHyb is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

AnnHyb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with AnnHyb; see the file COPYING.TXT.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
}

unit seq; 

interface
uses controls,classes,sysutils,dialogs,u_progress,forms;

type T_sq=record
           name,longname,_type,creationdate,creator:shortstring;
           memposheader,memposseq:integer;
           descrip,features:string;
           hrsf:string;
           seq:string;
           seqrtf:string;
           flag_f_ok:boolean;
           end;
      psq=^T_sq;
      
const md:shortstring='JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC';

function extr(s:string; var erreur:boolean):string; // extrait nt d'une sq.
procedure analyse_seq(seq2:tstrings;var seq:t_sq;var seqformat:shortstring);

implementation
uses u_sequenceattributes,u_features,u_biotools;

var save_cursor:Tcursor;

function extr(s:string; var erreur:boolean):string; //extrait nt d'une sq.
var t:cardinal;
    s1:string;
begin
s1:='';
erreur:=false;
//s:=uppercase(s);
for t:=1 to length(s) do
    if upcase(s[t]) in ['A','C','G','T','Y','R','W','S','K','M','B','D','H','V','N'] then
       s1:=s1+s[t]
    else
       erreur:=true;
extr:=s1;
end;   //extr

procedure analyse_seq;
var i:integer;
    flagerreur:boolean;

procedure remove_HTML(flagprogressbar:boolean);    
var i:cardinal;
    j:byte;
    s:shortstring;
begin

if flagprogressbar then
   begin
   frmProgress.caption:='Removing HTML code';
   frmProgress.label1.caption:='Please wait...';
   frmProgress.show;
   end;
i:=0;
while i<=seq2.count-1 do
   begin
   if flagprogressbar then
      frmProgress.progressbar1.position:=round(i/seq2.count*100);
   if pos('</A>',seq2[i])<>0 then
      begin
      s:=seq2[i];
      delete(s,pos('</A>',s),4);
      seq2[i]:=s;
      end;
   if (pos('<',seq2[i])<>0) and (pos('>',seq2[i])<>0) and (pos('<',seq2[i])<pos('>',seq2[i])) then
   repeat
       j:=pos('<',seq2[i]);
       if j<>0 then
          if pos('>',seq2[i])<>0 then
             if j<pos('>',seq2[i]) then
                begin
                s:=seq2[i];
                delete(s,j,pos('>',s)-j+1);
                seq2[i]:=s;
                end;
   until (pos('<',seq2[i])=0) or (pos('>',seq2[i])=0);
   repeat
   if pos('&quot;',seq2[i])<>0 then
      begin
      s:=seq2[i];
      insert('"',s,pos('&quot;',s));
      delete(s,pos('&quot;',s),6);
      seq2[i]:=s;
      end;
   until pos('&quot;',seq2[i])=0;
   inc(i);
   end;
while seq2[0]='' do
   seq2.delete(0);
if flagprogressbar then
   frmProgress.close;
   
end; //remove HTML

//======================================================================================
procedure embl;

function nucldeb(s:shortstring):shortstring;
var ss:shortstring;
    t:byte;
begin
ss:='';
t:=22;
while s[t] in ['0'..'9'] do
   begin
   ss:=ss+s[t];
   inc(t);
   end;
nucldeb:=ss;
end;

function nuclfin(s:shortstring):shortstring;
var ss:shortstring;
    t:byte;
begin
ss:='';
t:=22;
while s[t] in ['0'..'9'] do //passe 1ers chiffres
   inc(t);
while not (s[t] in ['0'..'9']) do //passe points
   inc(t);

while s[t] in ['0'..'9'] do //passe 1ers chiffres
   begin
   ss:=ss+s[t];
   inc(t);
   end;
nuclfin:=ss;
end;  //nuclfin

var j:integer;
    strdate:shortstring;
label suite,fin;
begin           //embl
frmProgress.caption:='Sequence analysis in progress...';
frmProgress.label1.caption:='Please wait...';
frmProgress.btnCancel.visible:=false;

frmProgress.show;

seq.hrsf:='';
seq.creator:='';
seq.creationdate:='';
while i<=seq2.count-1 do
   begin
   if pos('ID',seq2[i])=1 then
      begin
      seq.name:=wordextr(seq2[i+2],' ',2);
      seq.name:=copy(seq.name,1,length(seq.name)-1);
      seq.longname:=wordextr(seq2[i],' ',2); //ID
      if pos(' RNA;',seq2[i])<>0 then
         seq._type:='RNA';
      if pos(' DNA;',seq2[i])<>0 then
         seq._type:='DNA';
      end;
   //date

   if (pos('DT',seq2[i])=1) and (seq.creationdate='') then
      begin
      strdate:=wordextr(seq2[i],' ',2);

      for j:=1 to 12 do
          if pos(copy(md,(j-1)*3+1,3),strdate)<>0 then
             seq.creationdate:=inttostr(j);
      if length(seq.creationdate)=1 then
         seq.creationdate:='0'+seq.creationdate;
      seq.creationdate:=strdate[1]+strdate[2]+'/'+seq.creationdate+'/'+copy(strdate,8,4);
      end;
   if pos('DE',seq2[i])=1 then
      begin
      seq.descrip:=copy(seq2[i],6,255);
      j:=i+1;
      while pos('DE',seq2[j])=1 do
            begin
            seq.descrip:=seq.descrip+' '+trim(copy(seq2[j],6,255));
            inc(j);
            inc(i);
            end;
      seq.hrsf:=seq.hrsf+CRLF;
      end;
(*   if pos('AC',seq2[i])=1 then
      begin
      sequence_attributes.s_name.text:=copy(seq2[i],6,255);
      sequence_attributes.s_longname.text:=sequence_attributes.s_name.text;
      end;
*)
(*   if pos(' DNA;',seq2[i])<>0 then
      seq.hrsf:=seq.hrsf+'type     DNA'+#13#10;
   if pos(' RNA;',seq2[i])<>0 then
      seq.hrsf:=seq.hrsf+'type     RNA'+#13#10;
*)
 (*  if pos('FT   source',seq2[i])=1 then
      begin
      ss:='feature  '+nucldeb(seq2[i])+'  '+nuclfin(seq2[i])+'  '+'9'+' square solid '+'source';
      features.reg.Items.add(ss);
      features.reg.checked[features.reg.items.count-1]:=true;
      end;
*)
(*   if pos('FT   CDS',seq2[i])=1 then
      begin
      ss:='feature  '+nucldeb(seq2[i])+'  '+nuclfin(seq2[i])+'  '+'9'+' square solid '+'CDS';
      features.reg.Items.add(ss);
      features.reg.checked[features.reg.items.count-1]:=true;
      end;
*)
   //quitte header qd seq
   if pos('SQ   ',seq2[i])=1 then
      goto suite;
   //insre toutes les lignes de l'header comme comments
//   seq.header:=seq.header+seq2[i]+#13#10;
   inc(i);
   end;
goto fin;
suite:
//copie header EMBL ds comments
//seq.hrsf:=seq.hrsf+'comments'+CRLF;
for j:=0 to i do
    seq.hrsf:=seq.hrsf+'   '+seq2[j]+CRLF;

inc(i);
if seq2[i]='' then
   begin
   inc(i,3);
   seqformat:='EMBL/GCG';
   seq.seq:=extr(seq2[i],flagerreur);
   while i<seq2.count-1 do
         begin
         inc(i);
         application.processmessages;
         frmProgress.progressbar1.position:=round(i/seq2.count*100);
         if pos('//',seq2[i])=1 then
            break;
         seq.seq:=seq.seq+extr(seq2[i],flagerreur);
         end;
   end
else
   begin
   seqformat:='EMBL';
   seq.seq:=extr(seq2[i],flagerreur);
   while i<seq2.count-1 do
         begin
         inc(i);
         application.processmessages;
         frmProgress.progressbar1.position:=round(i/seq2.count*100);
         if pos('//',seq2[i])=1 then
            break;
         seq.seq:=seq.seq+extr(seq2[i],flagerreur);
         end;
   end;
for j:=i to seq2.Count-1 do
     if pos('ID ',seq2[j])=1 then
         begin
         seqformat:='MULTI-EMBL';
         break;
         end;
fin:
frmProgress.hide;
end; //embl


//======================================================================================
procedure gb;
var j:integer;
    strdate:shortstring;
label suite,fin;
begin
frmProgress.caption:='Sequence analysis in progress...';
frmProgress.label1.caption:='Please wait...';
frmProgress.btnCancel.visible:=false;

frmProgress.show;

seq.hrsf:='';
seq.creator:='';
seq.creationdate:='';

while i<=seq2.count-1 do
   begin
   if seq2[i]='//' then
       break;
   if pos('LOCUS       ',seq2[i])=1 then
      begin
      seq.longname:=wordextr(seq2[i],' ',2);
      //date
      strdate:=wordextr(seq2[i],' ',8);
      for j:=1 to 12 do
          if pos(copy(md,(j-1)*3+1,3),strdate)<>0 then
             seq.creationdate:=inttostr(j);
      if length(seq.creationdate)=1 then
         seq.creationdate:='0'+seq.creationdate;
      seq.creationdate:=strdate[1]+strdate[2]+'/'+seq.creationdate+'/'+copy(strdate,8,4);
      end;
   if pos('ACCESSION ',seq2[i])=1 then
      seq.name:=wordextr(seq2[i],' ',2);
   if pos(' DNA ',seq2[i])<>0 then
      seq._type:='DNA';
   if pos(' RNA ',seq2[i])<>0 then
      seq._type:='RNA';
   if pos(' mRNA ',seq2[i])<>0 then
      seq._type:='RNA';
   if pos('DEFINITION  ',seq2[i])=1 then
      begin
      seq.descrip:=copy(seq2[i],13,255);
      j:=i+1;
      while copy(seq2[j],1,12)='            ' do
            begin
            seq.descrip:=seq.descrip+' '+trim(copy(seq2[j],13,255));
            inc(j);
            end;
      seq.hrsf:=seq.hrsf+CRLF;
      end;

   //quitte header qd seq
   if pos('ORIGIN',seq2[i])=1 then
      goto suite;

//   seq.header:=seq.header+seq2[i]+#13#10;
   inc(i);
   end;
goto fin;
suite:
//sequence header (DEFINITION)
for j:=0 to i do
    seq.hrsf:=seq.hrsf+'   '+seq2[j]+CRLF;
//seq.hrsf:=seq.hrsf+'sequence';
inc(i);
if seq2[i]='' then
   begin
   inc(i,3);
   seqformat:='Genbank/GCG';
   end
else
   seqformat:='Genbank';

seq.seq:=extr(seq2[i],flagerreur);
while i<seq2.count-1 do
   begin
   inc(i);
   if pos('//',seq2[i])=1 then
      break;
   seq.seq:=seq.seq+extr(seq2[i],flagerreur);
   end;
for j:=i to seq2.count-1 do
    if pos('LOCUS  ',seq2[j])=1 then
        begin
        seqformat:='MULTI-Genbank';
        break;
        end;
fin:
frmProgress.Hide;
end;//gb

//======================================================================================
procedure fasta;
var flagmulti:boolean;
begin
flagmulti:=false;
while i<seq2.count-1 do
   begin
   inc(i);
   if pos('>',seq2[i])<>0 then
      begin
      flagmulti:=true;
      dec(i);
      break;
      end;
   seq.seq:=seq.seq+extr(seq2[i],flagerreur);
   end;
if flagmulti then
   seqformat:='MULTI-FASTA'
else
   seqformat:='FASTA';
end;

//======================================================================================
procedure gcg;
var s,mois,jour:shortstring;
    b:byte;
begin
s:=copy(seq2[i],pos('Length:',seq2[i])+8,255);
while s[1]<>' ' do
      delete(s,1,1);
delete(s,1,1);
s:=trim(copy(s,1,pos('  ',s)-1));

mois:=copy(s,1,pos(' ',s)-1);
for b:=1 to 12 do
    if mois=month_[b] then
       mois:=inttostr(b);
if length(mois)=1 then
   mois:='0'+mois;

delete(s,1,pos(' ',s));

jour:=copy(s,1,pos(',',s)-1);
if length(jour)=1 then
   jour:='0'+jour;

delete(s,1,pos(',',s)+1);

s:=copy(s,1,pos(' ',s)-1);

seq.creationdate:=jour+'/'+mois+'/'+s;

while i<seq2.count-1 do
   begin
   inc(i);
   seq.seq:=seq.seq+extr(seq2[i],flagerreur);
   end;
seqformat:='GCG';
end;

//======================================================================================
procedure rsf;
label suite;
begin
seq.name:='';
seq.longname:='';
seq._type:='';
seq.descrip:='';
seq.creationdate:='';
seq.creator:='';
seq.hrsf:='';
seq.features:='';

while seq2[i]<>'{' do
   inc(i);
inc(i);
while i<=seq2.count-1 do
   begin
   if pos('name',seq2[i])=1 then
      seq.name:=trim(copy(seq2[i],5,255))
   else if pos('longname',seq2[i])=1 then
      seq.longname:=trim(copy(seq2[i],11,255))
   else if pos('type',seq2[i])=1 then
      seq._type:=trim(copy(seq2[i],5,255))
   else if pos('descrip',seq2[i])=1 then
      seq.descrip:=trim(copy(seq2[i],8,255))
   else if pos('creation-date',seq2[i])=1 then
      seq.creationdate:=trim(copy(seq2[i],16,255))
   else if pos('creator',seq2[i])=1 then
      seq.creator:=trim(copy(seq2[i],12,255))
   else if pos('feature',seq2[i])=1 then
      seq.features:=seq.features+seq2[i]+LF
(*
   else if pos('comments',seq2[i])=1 then
        begin
        inc(i);
        while (seq2[i]<>'') and (seq2[i][1]=' ') do
              begin
              seq.comments:=seq.comments+seq2[i]+LF;
              inc(i);
              end;
        end
*)
   else if pos('sequence',seq2[i])=1 then
      goto suite
   else if (pos('checksum',seq2[i])<>1) and (pos('comments',seq2[i])<>1) then
      seq.hrsf:=seq.hrsf+seq2[i]+CRLF;
   inc(i);
   end;
exit;
suite:
inc(i);
while seq2[i]<>'}' do
   begin
   seq.seq:=seq.seq+extr(seq2[i],flagerreur);
   inc(i);
   end;
if (i+1<=seq2.Count-1) and (seq2[i+1]='{') then
   seqformat:='MULTI-RSF'
else
   seqformat:='RSF';
end; //rsf

//======================================================================================
procedure gff;
begin
seq.hrsf:='';
seq.features:='';
while i<=seq2.count-1 do
    begin
    if pos('##date',seq2[i])<>0 then
       seq.creationdate:=copy(seq2[i],16,2)+'/'+copy(seq2[i],13,2)+'/'+copy(seq2[i],8,4);
    if (pos('##DNA',seq2[i])<>0) or (pos('##RNA',seq2[i])<>0) then
       begin
       seq.name:=copy(seq2[i],7,255);
       inc(i);
       while i<=seq2.Count-1 do
           begin
           if seq2[i]<>'##end-DNA' then
              seq.seq:=seq.seq+copy(seq2[i],3,255);
           inc(i);
           end;
       end;

    inc(i);
    end;
seqformat:='GFF';
end; //gff

//========================================================================================
procedure annhyb_probe;
begin
while i<=seq2.count-1 do
   begin
   seq.seq:=seq.seq+extr(seq2[i],flagerreur);
   inc(i);
   end;
seqformat:='AnnHyb probe';
seq.name:=seq.name;
seq._type:='';
seq.descrip:='';
//seq.hrsf:=seq.hrsf+'comments'+CRLF;
end;

//=====================================================
//=====================================================

var j,compt:cardinal;
label fin;
begin               //analyse_seq;
i:=0;
seqformat:='';
seq.seq:='';
seq.hrsf:='';
save_cursor:=screen.cursor;
screen.cursor:=crHourGlass;
remove_HTML(false);
while i<=seq2.count-1 do
   begin
   // format probe annhyb
   if seq2[0]='AnnHyb - Olivier Friard 1997' then
      begin
      seq.name:=seq2[1];
      i:=2;
      annhyb_probe;
      break;
      end;
   //seq RSF
   if pos('!!RICH_SEQUENCE',seq2[i])=1 then
      begin
      try
        rsf;
      except
        begin
        showmessage('file corrupted');
        frmProgress.hide;
        seqformat:='file corrupted';
        break;
        end;
      end;
      if seqformat='MULTI-RSF' then
         begin
         for j:=0 to i do
             seq2.Delete(0);
         seq2.Insert(0,'!!RICH_SEQUENCE');
         end;
      break;
      end;
   if pos('>',seq2[i])=1 then   //seq FASTA
      begin
      seq.name:=trim(copy(seq2[i],2,255));
      seq.longname:=seq.name;
      seq.creator:='';
      seq.creationdate:='';
      seq._type:='';
      seq.descrip:='';
      //seq.hrsf:=seq.hrsf+'comments'+CRLF;
      fasta;
      if seqformat='MULTI-FASTA' then
         begin
         for j:=0 to i do
             seq2.Delete(0);
         break;
         end;
      end;

   //EMBL format
   if pos('ID   ',seq2[i])=1 then     //seq EMBL
      begin
      seq.name:=trim(copy(seq2[i],6,11));
      try
          embl;
          if seqformat='MULTI-EMBL' then
              begin
              for j:=0 to i do
                  seq2.Delete(0);
              break;
              end;
      except
        begin
        showmessage('file corrupted');
        frmProgress.hide;
        seqformat:='file corrupted';
        break;
        end;
      end;
      end;

   //GENBANK format
   if pos('LOCUS       ',seq2[i])=1 then        //seq GB
      begin
      seq.name:=trim(copy(seq2[i],13,11));
      try
        gb;
        if seqformat='MULTI-Genbank' then
            begin
            for j:=0 to i do
               seq2.Delete(0);
            break;
            end;

      except
        begin
        showmessage('file corrupted');
        frmProgress.hide;
        seqformat:='file corrupted';
        break;
        end;
      end;
      end;

   //GCG format
   if (pos('Length:',seq2[i])<>0) and (pos('Check:',seq2[i])<>0) and (pos('..',seq2[i])<>0) then //seq GCG
      begin
      seq.name:=copy(trim(seq2[i]),1,pos(' ',trim(seq2[i]))-1);
      gcg;
      end;

   //GFF format
   if pos('##gff',seq2[i])=1 then
      begin
      gff;
      end;

   inc(i);
   end;
screen.cursor:=save_cursor;
if seqformat='file corrupted' then
   begin
   seqformat:='';
   exit;
   end;
if seqformat='' then
   if MessageDlg('Sequence file not recognized! Treat as plain text sequence?',mtConfirmation,[mbYes,mbNo],0) = mrYes then
      begin
      compt:=0;
      for i:=0 to seq2.count-1 do
          for j:=1 to length(seq2[i]) do
              begin
              application.ProcessMessages;
              if not (upcase(seq2[i][j]) in [' ',#10,#13,'0'..'9','A','C','G','T','Y','R','W','S','K','M','B','D','H','V','N']) then
                 inc(compt);
              end;
      if compt=0 then
          begin
          seqformat:='Plaintext';
          seq.name:='no name';
          seq.longname:='';
          seq.creator:='';
          seq.creationdate:='';
          seq._type:='';
          seq.descrip:='';
          for i:=0 to seq2.count-1 do
              begin
              application.processmessages;
              seq.seq:=seq.seq+extr(seq2[i],flagerreur);
              end;
          end
      else
          begin
          showmessage('Sequence file contains characters that are not valid nucleotides!');
          seqformat:='';
          end;
      end;
(*
if seqformat='' then
   begin
    frmProgress.caption:='Sequence processing';
    frmProgress.label1.caption:='Please wait...';
    frmProgress.btnCancel.visible:=false;
    frmProgress.progressbar1.position:=0;
    frmProgress.show;

   compt:=0;
   for i:=0 to seq2.count-1 do
       for j:=1 to length(seq2[i]) do
           begin
           application.processmessages;
           if not (upcase(seq2[i][j]) in [' ',#10,#13,'0'..'9','A','C','G','T','Y','R','W','S','K','M','B','D','H','V','N']) then
              inc(compt);
           frmProgress.progressbar1.position:=round(j/length(seq2[i])*100);
           end;
   if compt=0 then
      begin
      seqformat:='Plaintext';
      seq.name:='no name';
      seq.hrsf:='';
      for i:=0 to seq2.count-1 do
          begin
          application.processmessages;
          seq.seq:=seq.seq+extr(seq2[i],flagerreur);
          frmProgress.progressbar1.position:=round(i/seq2.count*100);
          end;
      end
   else
      seqformat:='Unknown sequence format!'+#10#13+'Characters not recognized as nucleotides: '+inttostr(compt);
   frmProgress.hide;
   end;
*)
fin:
end; // analyse seq
end.
