#!/bin/bash 
#   GBUPDATE,   Version  01/03/2011
# bash script to download GenPept files

# 8/17/2009   Converted from csh script to bash script
#           
# 31 Oct 2007 Added RM_CMD and FTPCOMMAND variables, to get
#             around differences among Unix and Linux implementations.
# 24 Feb 2007 As of GenBank Release 158.0, the Accession index
#             has been split among several files, named gbacc1.idx,gbacc2.idx etc.
#             gbupdate now handles these files in the same way it handles
#             sequence files. Simply include 'acc' in filelist as if it
#             was another division.
# 29 Apr 2004 GenPept files have a new naming convention. For example,
#             in GenBank Release 141, rel141.fsa_aa
# 21 Aug 2000 Updated to extract gzipped files, rather
#             than tar.Z files.

# Example: {uses 'at' command to run a delayed batch job}
#  at 1am
#  at>gpupdate filelist &
#  at>ctrl-D  {ends the command }

#  This assumes you are in the GenPept directory.  The existing files will be 
#  replaced with the new ones, and so must be writeable.
#  Remember to include the index and documentation files in filelist.
#  When disk space is tight, edit the 'filelist' file to download files in
#  order of decreasing size.  In general, you need to have at least twice as
#  much empty space as is necessary to hold the largest file to be downloaded.
#  This can be circumvented by setting $tmpdir to /tmp, rather than ".".

#-----------------------  Set environment variables -------------
# Look up the administrator's e-mail address (the BirchProps.adminEmail entry
# of BIRCH.properties). It is used further down as the password for the
# anonymous FTP login, so the script cannot continue without it.
MAILID=$(grep 'BirchProps.adminEmail' "$BIRCH/local/admin/BIRCH.properties" \
           | cut -f2 -d "=")
if [ -z "$MAILID" ]
  then
  # Report on stderr and exit non-zero, so that a calling job can tell that
  # nothing was downloaded.
  echo "BirchProps.adminEmail must be set to your full Internet address" >&2
  echo "in the form     userid@hostname" >&2
  echo "in the file $BIRCH/local/admin/BIRCH.properties" >&2
  exit 1
fi


# RM_CMD - the command used to remove files and directories.
# An absolute path is preferred (/usr/bin/rm first, then /bin/rm); if neither
# file exists, fall back to whatever 'rm' resolves to at run time.
if [[ -f /usr/bin/rm ]]; then
   RM_CMD=/usr/bin/rm
elif [[ -f /bin/rm ]]; then
   RM_CMD=/bin/rm
else
   RM_CMD=rm
fi

  # Choose the FTP command.
  # Transfers must run in passive mode, which is required by some firewalls.
  # There is a lot of inconsistency from system to system in how ftp is run
  # in passive mode. One or more of these works on each system, but none
  # works on all systems:
  #   ftp -p
  #   ftp, input 'passive' from ftp.input
  #   pftp
  #
  # pftp is used when it can be found on the PATH; otherwise 'ftp -p'.
  # The lookup uses the shell builtin 'command -v' and tests its exit status,
  # rather than counting the words printed by 'which': that count is wrong
  # when 'which' prints a "not found" message on stdout or when the path
  # contains blanks.
if command -v pftp >/dev/null 2>&1
then
   FTPCOMMAND='pftp'
else
   FTPCOMMAND='ftp -p'
fi


# Directory in which downloaded files are stored until they have been
# checked and installed.
#tmpdir=.
tmpdir=/usr/local/tmp/psgendb
# 'mkdir -p' also creates missing parent directories and succeeds when the
# directory already exists. Stop if the directory cannot be created: the
# main loop deletes the current local copy of each file before downloading
# its replacement into $tmpdir, so carrying on would only lose data.
if ! mkdir -p "$tmpdir"
then
   echo "Cannot create temporary directory $tmpdir" >&2
   exit 1
fi

# Login for anonymous FTP; the e-mail address held in MAILID is sent as the
# password.
GBUSERID='anonymous'
GBPASSWD="$MAILID"

# Field numbers of the file length in 'ls -l' output:
#   LLENGTH - on the local host
#   RLENGTH - on the remote host
LLENGTH=5
RLENGTH=5

# Work in the GenPept directory named by $GP. Stop if GP is empty or the
# directory cannot be entered: 'cd' with no argument silently switches to
# $HOME, and the rest of the script removes and writes files in the current
# directory.
if [ -z "$GP" ] || ! cd "$GP"
then
   echo "Cannot change to the GenPept directory GP='$GP'" >&2
   exit 1
fi

# GenBank download site, and directory at that site.
# Mirror sites and directories are commented out.
# - - - - - - NCBI
#GBHOST=ftp.ncbi.nih.gov
#GBDIR=genbank
#RLENGTH=5

#- - - - - - -JAPAN
#GBHOST=bio-mirror.jp.apan.net 
#GBDIR=pub/biomirror/genbank 

#- - - - - - -AUSTRALIA
#GBHOST=bio-mirror.au.apan.net
#GBDIR=biomirror/genbank 

#- - - - - - -SINGAPORE
#GBHOST=bio-mirror.sg.apan.net
#GBDIR=biomirrors/genbank 

#- - - - - - -CHINA
#GBHOST=bio-mirror.im.ac.cn
#GBDIR=genbank 

#- - - - - - -USA - Indiana University   (the mirror currently in use)
# Host name, directory on that host, and the anonymous login sent to it.
GBHOST='bio-mirror.net'
GBDIR='biomirror/genpept'
GBUSERID='anonymous'
GBPASSWD="$MAILID"

#- - - - - - -USA - San Diego Supercomputing Center
#GBHOST=genbank.sdsc.edu
#GBDIR=pub
#GBUSERID=anonymous
#GBPASSWD=$MAILID

# - - - - - - - - - - - - - - -
# Fetch a long ('ls -l') listing of the files on the remote server and
# store it locally in ls.out.
# The FTP commands are written to ftp.input in a single pass: log in, change
# to the remote directory, select binary mode, list, log out.
{
  echo user $GBUSERID $GBPASSWD
  echo cd $GBDIR
  echo bin
  echo ls -l ls.out
  echo bye
} > ftp.input
# Run the FTP client under 'nice', taking its commands from ftp.input.
# $FTPCOMMAND is left unquoted on purpose: it may hold a command plus an
# option ('ftp -p').
nice $FTPCOMMAND -i -n $GBHOST < ftp.input


#-----------------------  MAIN LOOP -------------
# For every file named in the list given as $1:
#   1. remove the current local copy, to make space;
#   2. download the file into $tmpdir;
#   3. compare the size of the downloaded file with the size recorded for it
#      in ls.out (the remote listing);
#   4. if the sizes agree, log the file in files_received and install it in
#      the current directory (uncompressing .gz and .Z files); otherwise add
#      its name to files_missed.

# The names in the list are separated by whitespace, so word splitting of the
# command substitution is intentional. When no argument is given, 'cat' reads
# the list from standard input.
for file in $(cat ${1:+"$1"}) ; do
     # EXT is the text after the last '.'; 'name' is the local file name,
     # which drops a trailing .gz or .Z.
     EXT=${file##*.}
     case "$EXT" in
        gz|Z) compressed=yes ; name=${file%.*} ;;
        *)    compressed=no  ; name=$file ;;
     esac

     # Make some space by deleting the current file, if it exists.
    if [ -f "$name" ]
       then
       echo "Removing $name"
       "$RM_CMD" "$name"
    fi

     # Create the input file for the ftp command: log in, move to the correct
     # directory, download the file into $tmpdir, log out.
    {
      echo user $GBUSERID $GBPASSWD
      echo cd $GBDIR
      echo bin
      echo get $file $tmpdir/$file
      echo bye
    } > ftp.input

     #   Get the file from GenBank.
     #   $FTPCOMMAND is deliberately unquoted: it may be 'ftp -p'.
    nice $FTPCOMMAND -i -n $GBHOST < ftp.input

     #   Make sure that the file received is the same length as the original
     #   file.
     #   The remote size is taken from the ls.out line whose last field is
     #   exactly $file (a trailing carriage return is ignored). The name is
     #   compared as a string, not used as a regular expression, so '.' is
     #   not a wildcard and a name contained in a longer name cannot match
     #   the wrong line.
    ORIGINAL=$(awk -v f="$file" -v n="$RLENGTH" \
                 '{ sub(/\r$/, "", $NF) } $NF == f { print $n; exit }' ls.out)
    RECEIVED=$(ls -l "$tmpdir/$file" | awk -v n="$LLENGTH" '{ print $n; exit }')
    echo 'ORIGINAL=  '$ORIGINAL
    echo 'RECEIVED=  '$RECEIVED

     # An empty ORIGINAL means the file is not in the listing (or the listing
     # is missing). It must not count as a match, even when the download
     # failed as well and RECEIVED is empty too.
    if [ -n "$ORIGINAL" ] && [ "$ORIGINAL" = "$RECEIVED" ]
       then
       ls -l "$tmpdir/$file" >> files_received

       if [ "$compressed" = yes ]
         then
            # Uncompress the file into the current directory. The downloaded
            # archive is removed only after gunzip succeeds; on failure it is
            # kept and the file is recorded as missed.
         if nice -n 10 gunzip -c -f "$tmpdir/$file" > "$name"
           then
            "$RM_CMD" "$tmpdir/$file"
            chmod a+r "$name"
         else
            echo "Could not uncompress $tmpdir/$file" >&2
            echo "$file" >> files_missed
            continue
         fi
       else
            # Install the uncompressed download from $tmpdir. Nothing to do
            # when it was downloaded in place (tmpdir set to the current
            # directory).
         if [ ! "$tmpdir/$file" -ef "$name" ]
           then
            mv "$tmpdir/$file" "$name"
         fi
       fi

         # Make a symbolic link to fasta file, called genpept.wrp.
         # This is a legacy for existing BIRCH scripts, and can be removed
         # when the scripts are updated.
       if [ "${name##*.}" = fasta ]
         then
            # -f: do not complain when there is no previous link.
          "$RM_CMD" -f "$GP/genpept.wrp"
          ln -s "$name" "$GP/genpept.wrp"
       fi
    else
       echo "$file" >> files_missed
    fi # $ORIGINAL = $RECEIVED

done # for file
