#!/bin/csh 
#   GBUPDATE,   Version  10/31/2007
#csh script to download GenBank files

# 31 Oct 2007 Added RM_CMD and FTPCOMMAND variables, to get
#             around differences among Unix and Linux implementations.
# 24 Feb 2007 As of GenBank Release 158.0, the Accession index
#             has been split among several files, named gbacc1.idx,gbacc2.idx etc.
#             gbupdate now handles these files in the same way it handles
#             sequence files. Simply include 'acc' in filelist as if it
#             was another division.
# 29 Apr 2004 GenPept files have a new naming convention. For example,
#             in GenBank Release 141, rel141.fsa_aa
# 21 Aug 2000 Updated to extract gzipped files, rather
#             than tar.Z files.

# Example: {uses 'at' command to run a delayed batch job}
#  at 1am
#  at>gbupdate filelist &
#  at>ctrl-D  {ends the command }

#  This assumes you are in the GenBank directory.  The existing files will be 
#  replaced with the new ones, and so must be writeable.
#  Remember to include the index and documentation files in filelist.
#  When disk space is tight, edit the 'filelist' file to download files in
#  order of decreasing size.  In general, you need to have at least twice as
#  much empty space as is necessary to hold the largest file to be downloaded.
#  This can be circumvented by setting $tmpdir to /tmp, rather than ".".

#-----------------------  Set environment variables -------------
# MAILID is used further down as the password for the anonymous FTP login,
# so refuse to run without it. Exit with a non-zero status so that a caller
# (for example an at or cron job) can tell that nothing was downloaded.
if (${?MAILID} == 0) then
  echo Environment variable MAILID must be set to your full Internet address
  echo in the form     userid@hostname
  echo This is best done in your .cshrc file.
  exit 1
endif

# RM_CMD - command used to delete files and directories.
# Prefer an absolute path to rm, trying /usr/bin before /bin, and fall
# back to whatever rm the shell finds when neither path exists.
if (-e /usr/bin/rm) then
   set RM_CMD = /usr/bin/rm
else if (-e /bin/rm) then
   set RM_CMD = /bin/rm
else
   set RM_CMD = rm
endif

  # FTPCOMMAND - how to start the FTP client in passive mode.
  # Passive mode is required by some firewalls, but the way to request it
  # differs from system to system. One or more of the following works on
  # each system, but none works on all of them:
  #   ftp -p
  #   ftp, with 'passive' given as an input command
  #   pftp
  #
  # Start from 'ftp -p' and switch to pftp only when 'which pftp' prints
  # exactly one word, which is taken to mean that pftp was found.
  set FTPCOMMAND = 'ftp -p'
  set RESULT = `which pftp | wc -w`
  if ( "$RESULT" == "1" ) set FTPCOMMAND = 'pftp'

#- - - - - - - - -  directory to store temporary files - - - - - - - - -
# Downloaded files are written to $tmpdir before they are unpacked.
# The current working directory is normally fine; choose another place
# only when the filesystem that holds GenBank is almost full.
set tmpdir = "."

# /tmp and /var/tmp are two alternatives when space is short in the target
# filesystem. Beware: filling /tmp interferes with everyone else on the
# system, and on some systems /tmp is surprisingly small because it is
# taken from swap.
#set tmpdir = /tmp

# On some systems, /var/tmp is an alternative place for temporary files
#set tmpdir = /var/tmp
#- - - - - - - - - - - - - - - - - - - - - -


# LLENGTH and RLENGTH give the number of the 'ls -l' field that holds the
# file length, on the local and on the remote host respectively.
set LLENGTH = 5
set RLENGTH = 5

# Default FTP login: anonymous, with $MAILID as the password.
set GBPASSWD = $MAILID
set GBUSERID = anonymous

# Work inside the directory named by $GENBANK from here on.
cd $GENBANK

# GenBank download site (GBHOST) and the directory at that site (GBDIR).
# Exactly one site should be active; to switch, comment out the active
# assignments and uncomment the ones for the site you want.
# - - - - - - NCBI
#set GBHOST = ftp.ncbi.nih.gov
#set GBDIR = genbank
#set RLENGTH = 5

#- - - - - - -JAPAN
#set GBHOST = bio-mirror.jp.apan.net 
#set GBDIR = pub/biomirror/genbank 

#- - - - - - -AUSTRALIA
#set GBHOST = bio-mirror.au.apan.net
#set GBDIR = biomirror/genbank 

#- - - - - - -SINGAPORE
#set GBHOST = bio-mirror.sg.apan.net
#set GBDIR = biomirrors/genbank 

#- - - - - - -CHINA
#set GBHOST = bio-mirror.im.ac.cn
#set GBDIR = genbank 

#- - - - - - -USA - Indiana University   (the active site)
set GBUSERID = anonymous
set GBPASSWD = $MAILID
set GBHOST = bio-mirror.net
set GBDIR = biomirror/genbank

#- - - - - - -USA - San Diego Supercomputing Center
#set GBHOST = genbank.sdsc.edu
#set GBDIR =  pub
#set GBUSERID = anonymous
#set GBPASSWD = $MAILID

# - - - - - - - - - - - - - - -
# Use 'ls -l' on the remote server to write a list of the GenBank files
# found there into the local file ls.out. The main loop reads ls.out to
# find the files of each division and the expected size of every file.

  # Discard any listing left over from an earlier run, so that a failed
  # FTP session cannot be mistaken for a fresh listing.
  $RM_CMD -f ls.out

  # generate FTP command file
  echo user $GBUSERID $GBPASSWD > ftp.input
  echo cd $GBDIR >>  ftp.input
  echo bin >> ftp.input
  echo ls -l ls.out >> ftp.input
  echo bye >> ftp.input
  # run FTP
  nice $FTPCOMMAND -i -n $GBHOST < ftp.input

  # Without a listing no download can be verified, so stop here instead of
  # deleting existing divisions and then failing every size check.
  if (! -e ls.out || -z ls.out) then
     echo Could not retrieve a file listing from $GBHOST
     exit 1
  endif

#-----------------------  MAIN LOOP -------------
# For every entry in the list file given as the first argument:
#   1. work out which remote files make up the entry, using ls.out
#   2. download each file by FTP into $tmpdir
#   3. compare the size of the local copy with the size in ls.out
#   4. on a match, log the file in files_received, then unpack and install
#      it; otherwise log the file name in files_missed.
# GP and DOC are not set in this script and must already be defined.
foreach entry (`cat $1`)

  echo $entry

  # Create a temporary list of all files for a GenBank division:
  # numbered index files first, then numbered sequence files.
  egrep -e gb$entry'[0-9]*\.idx\.gz' ls.out > $$.temp
  egrep -e gb$entry'[0-9]*\.seq\.gz' ls.out >> $$.temp
  cat $$.temp

  # If nothing matched, the entry is taken to be a file name in its own
  # right. Otherwise the division is split among several files, whose
  # names are read from field 9 of the listing. \011 is the tab character
  # that cut uses as its field separator.
  if ( -z $$.temp) then
    echo $entry > $$.filelist
  else
    tr -s ' ' '\011' < $$.temp | cut -f9 > $$.filelist
  endif
  cat $$.filelist
  $RM_CMD $$.temp

  foreach file (`cat $$.filelist`)

     #Nomenclature:
     # $file - file as stored on the server eg. gbest1.seq.gz
     # $name - $file minus .gz or .Z extension eg. gbest1.seq
     # $base - raw file name eg. gbest1
     # name is set on every pass, so that an uncompressed file never
     # inherits the value left behind by an earlier file.
     if ($file:e == gz || $file:e == Z) then
        set name = $file:r
        if ($name:e == seq) then
           set base = $name:r
           # Make some space by deleting the current GenBank division, if
           # it exists. The division is stored as $base.ano, $base.wrp and
           # $base.ind, so the pattern is built from $base.
           echo "Removing file(s) for $base, if they exist"
           $RM_CMD $base.*
        endif
     else
        set name = $file
     endif

     # Create input file for ftp command. Logs in, moves to correct
     # directory, and downloads the data. Then logs out.
     echo user $GBUSERID $GBPASSWD > ftp.input
     echo cd $GBDIR >>  ftp.input
     echo bin >> ftp.input
     echo get $file $tmpdir/$file  >>  ftp.input
     echo bye >> ftp.input

     #   Get the file from GenBank
     nice +2 $FTPCOMMAND -i -n $GBHOST < ftp.input

     #   Make sure that the file received is the same length as the
     #   original file. Both values are captured as word lists and are
     #   compared as quoted strings, so that a missing size or more than
     #   one matching line counts as a mismatch instead of a syntax error.
     set ORIGINAL = (`grep $file ls.out |tr -s ' ' '\011' |cut -f$RLENGTH`)
     set RECIEVED = (`ls -l $tmpdir/$file |tr -s ' ' '\011' |cut -f$LLENGTH`)
     echo 'ORIGINAL=  '$ORIGINAL
     echo 'RECEIVED=  '$RECIEVED

     if ("$ORIGINAL" != "" && "$ORIGINAL" == "$RECIEVED") then
        ls -l $tmpdir/$file >> files_received

        if ($file:e == gz || $file:e == Z) then
           if ($name:e == seq) then
              # Sequence file: uncompress it, then run splitdb to create
              # the database files.
              nice +10 gunzip -f $tmpdir/$file
              nice +10 splitdb -c $tmpdir/$name $base.ano $base.wrp $base.ind
              set success = $status
              chmod a+r $base.*

              # Remove the .seq file if splitdb exited with a return code
              # of 0. Otherwise, remove the partially split files to make
              # room. These will have to be split manually.
              if ($success == 0) then
                 $RM_CMD $tmpdir/$name
              else
                 $RM_CMD $base.ano $base.wrp $base.ind
              endif
           else
              # Any other compressed file: uncompress it into the current
              # directory and drop the compressed copy.
              nice +10 gunzip -c -f $tmpdir/$file > $name
              $RM_CMD $tmpdir/$file
              chmod a+r $name
           endif
        endif

        # Install files that live outside the GenBank directory.
        # NOTE(review): gbrel.txt is moved from the current directory, so
        # an uncompressed gbrel.txt is only found when tmpdir is "." -
        # confirm before pointing tmpdir elsewhere.
        if ($name:e == fsa_aa) then
           $RM_CMD $GP/*.fsa_aa
           mv $name $GP
           $RM_CMD $GP/genpept.wrp
           ln -s $name $GP/genpept.wrp
        else if ($name == gbrel.txt) then
           mv gbrel.txt $DOC/GenBank
        endif
     else
        # Size mismatch or failed download: record the file for a retry.
        echo $file >> files_missed
     endif
  end

  $RM_CMD $$.filelist
end
