#!/usr/bin/perl
#generic_ncbi_data_fetcher.pl
#
#This script uses NCBI's Entrez Programming Utilities to perform
#searches of NCBI databases. This script can return either the complete
#database records, or the IDs of the records (recommended). It is up to
#you to know how to handle the IDs and records. The results are written
#to a single output file.
#
#For additional information on NCBI's Entrez Programming Utilities see:
#http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
#
#There are five required command line options:
#
#-q unescaped query text, i.e. the query as a user would enter it.
#
#-o the results file to create.
#
#-d the database to search. The following values are supported: pubmed
#protein  nucleotide  nuccore  nucgss  nucest  structure  genome books
#cancerchromosomes  cdd  domains  gene  genomeprj  gensat  geo  gds
#homologene journals  mesh  ncbisearch  nlmcatalog  omia  omim  pmc
#popset  probe  pcassay pccompound  pcsubstance  snp  swissprot
#taxonomy  unigene  unists.
#
#-r the return type. Use "id" to obtain record ids, or use "complete"
#to specify that the complete records in default format should be
#obtained. Alternatively, supply any of the formats supported by
#NCBI. The accepted formats vary depending on the database you are
#searching. To specify certain formats for certain databases, edit the
#"Associating certain formats with certain databases" portion of this
#script. See how searches of "swissprot" are handled, for example, in
#this section.
#
#-m the max number of records or ids to obtain. Recommended value is
#100.
#
#There is one optional command line option:
#
#-s species to restrict search to.
#
#
#Example usage:
#
#The following obtains up to 100 NCBI ids for sequences found in
#swissprot. The search phrase is "diabetes" and the search is
#restricted to homo sapiens. 
#
#perl generic_ncbi_data_fetcher.pl -q diabetes -o results.txt -d
#swissprot -r id -m 100 -s "homo sapiens"
#
#The following obtains up to 100 PubMed ids for articles found in
#PubMed. The search phrase is "dysphagia" and the search is restricted
#to homo sapiens.
#
#perl generic_ncbi_data_fetcher.pl -q dysphagia -o results.txt -d
#pubmed -r id -m 100 -s "homo sapiens"
#
#The following obtains up to 50 protein sequences in fasta format from
#GenBank. The search phrase is "telomere" and the search is not
#restricted to any organism.
#
#perl generic_ncbi_data_fetcher.pl -q telomere -o results.txt -d
#protein -r fasta -m 50
#
#Exit status:
#If the script encounters an error it prints a message and exits with a
#non-zero status. If no error is encountered the script exits with a
#status of 0 upon completing.
#
#Written by Paul Stothard, Canadian Bioinformatics Help Desk.
#
#stothard@ualberta.ca

use warnings;
use strict;

#Command line processing.
use Getopt::Long;

#Web utilities
use LWP::Simple;
use URI::Escape;

use LWP::UserAgent;
use HTTP::Request::Common;

#Command line values; each stays undef until GetOptions fills it in.
my $query = undef;
my $outputFile = undef;
my $database = undef;
my $returnType = undef;
my $maxRecords = undef;
my $species = undef;

Getopt::Long::Configure ('bundling');
#GetOptions returns false when it meets an unknown or malformed option,
#so stop with the usage text instead of continuing with partial input.
GetOptions ('q|query=s' => \$query,
            'o|output_file=s' => \$outputFile,
            'd|database=s' => \$database,
            'r|return_type=s' => \$returnType,
            'm|max_records=s' => \$maxRecords,
            's|species=s' => \$species) or die(_getUsage());

#-q, -o, -d, -r and -m are all required; -s is the only optional one.
foreach my $required ($query, $outputFile, $database, $returnType, $maxRecords) {
    if (!defined($required)) {
        die(_getUsage());
    }
}

#The return type is compared against lower case keywords ("id",
#"complete") later in the script.
$returnType = lc($returnType);

#Accept only a value made up entirely of digits. An unanchored match
#would silently turn something like "abc12xyz" into 12.
if ($maxRecords =~ m/\A(\d+)\z/) {
    $maxRecords = $1;
}
else {
    die("-m must be an integer.\n");
}

#Named parameters handed to _writeSearchResultsToFile. -format is left
#undefined here and is decided below from the database and return type.
my %param = (-query => $query,
             -outputFile => $outputFile,
             -database => $database,
             -returnType => $returnType,
             -maxRecords => $maxRecords,
             -format => undef);

#build up the query if there is a species restriction
if (defined($species)) {
    $param{'-query'} = $param{'-query'} . " AND " . $species . "[ORGN]";
}

#Associating certain formats with certain databases
#########################
#########################
#########################
#########################
#Insert logic to set output format and queries based on the database being searched.
#For example, NCBI does not support a SwissProt search directly, but this script
#supports it by using a modified query of the NCBI protein database.
if ($param{'-database'} eq "swissprot") {
    $param{'-database'} = "protein";
    $param{'-query'} = $param{'-query'} . " AND " . "srcdb_swiss-prot[PROP]";
    #when getting complete records we can specify genpept format
    if ($param{'-returnType'} eq "complete") {
        $param{'-format'} = "genpept";
    }
}
#########################
#########################
#########################
#########################

#to get ids only in output file, override -format
if ($param{'-returnType'} eq "id") {
    $param{'-format'} = "uilist";
}
#for 'complete', fall back to the 'native' format only when the
#database-specific section above did not already choose a format;
#overwriting it here would discard e.g. the genpept choice for swissprot
elsif ($param{'-returnType'} eq "complete") {
    if (!defined($param{'-format'})) {
        $param{'-format'} = "native";
    }
}
#otherwise set it to the user set value, which wasn't 'id' and it wasn't 'complete'
else {
    $param{'-format'} = $param{'-returnType'};
}


_writeSearchResultsToFile(%param);


#Runs an ESearch for the supplied query and then downloads the matching
#records (or ids) in batches with EFetch, writing them to the output file.
#
#Expects a hash of named parameters:
#  -outputFile  path of the file to create (truncated if it exists)
#  -database    database name placed in the db= URL parameter
#  -query       unescaped query text
#  -format      value placed in the rettype= URL parameter
#  -maxRecords  upper limit on the number of entries to retrieve
#
#Dies if a request returns nothing, if the ESearch response cannot be
#parsed, or if the output file cannot be opened, written or closed.
sub _writeSearchResultsToFile {
    my %param = @_;

    #NOTE(review): NCBI's current documentation lists an https base URL on
    #eutils.ncbi.nlm.nih.gov - confirm this address is still served.
    my $url = "http://www.ncbi.nlm.nih.gov/entrez/eutils";

    my $output_file = $param{'-outputFile'};
    my $db = $param{'-database'};
    my $query = _escapeQuery($param{'-query'});
    my $report = $param{'-format'};
    my $maxRecords = $param{'-maxRecords'};

    #The QueryKey and WebEnv values returned by this request are what the
    #EFetch requests below use to refer back to this search.
    my $esearch = "$url/esearch.fcgi?" . "db=$db&retmax=1&usehistory=y&term=";

    #get() returns undef when the request fails.
    my $esearch_result = get($esearch . $query);
    if (!defined($esearch_result)) {
        die("Error: ESearch request failed.\n");
    }

    #In list context a failed match yields an empty list, leaving all
    #three variables undefined instead of holding stale capture values.
    my ($count, $query_key, $web_env) =
        $esearch_result =~ m/<Count>(\d+)<\/Count>.*<QueryKey>(\d+)<\/QueryKey>.*<WebEnv>(\S+)<\/WebEnv>/s;
    if (!defined($web_env)) {
        die("Error: Could not parse the ESearch response.\n");
    }

    #modify count so that it does not exceed $maxRecords
    if ($count > $maxRecords) {
        $count = $maxRecords;
    }

    #Largest number of entries requested from EFetch in one call.
    my $batchSize = 500;

    open (my $outFh, '>', $output_file) or die ("Error: Cannot open $output_file : $!");

    print "$count entries to retrieve\n";

    for (my $retstart = 0; $retstart < $count; $retstart = $retstart + $batchSize) {
        #Shrink the final batch so the total never exceeds $count; a fixed
        #batch size would fetch past the limit when $count is not a
        #multiple of it.
        my $remaining = $count - $retstart;
        my $retmax = ($remaining < $batchSize) ? $remaining : $batchSize;

        print "Requesting entries $retstart to " . ($retstart + $retmax) . "\n";
        my $efetch = "$url/efetch.fcgi?" . "rettype=$report&retmode=text&retstart=$retstart&retmax=$retmax&" . "db=$db&query_key=$query_key&WebEnv=$web_env";

        my $efetch_result = get($efetch);
        if (!defined($efetch_result)) {
            die("Error: EFetch request failed for entries starting at $retstart.\n");
        }

        print {$outFh} $efetch_result or die ("Error: Cannot write to $output_file : $!");
        #required by NCBI
        sleep(3);
    }

    close ($outFh) or die( "Error: Cannot close $output_file file: $!");
}

#Returns the one-line usage message (ending in a newline) that is shown
#when a required command line option is missing.
sub _getUsage {
    my $usage = "Usage: perl generic_ncbi_data_fetcher.pl"
        . " -q <query>"
        . " -o <output file>"
        . " -d <database>"
        . " -r <return type>"
        . "  -m <max records>"
        . " -s <species>"
        . "\n";
    return $usage;
}

#Passes the given query text through uri_escape and returns the result,
#ready to be appended to a request URL.
sub _escapeQuery {
    my ($rawQuery) = @_;
    my $escaped = uri_escape($rawQuery);
    return $escaped;
}
