#! /opt/gensoft/adm/bin/perl
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
##---------------------------------------------------------------------------##
##  File:
##      @(#) BuildDatabase.pl
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      A utility for creating a WU-Blast/RepeatModeler XDF database
##      from a set of fasta files.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2004 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#
# To Do:
#
#

=head1 NAME

BuildDatabase - Format FASTA files for use with RepeatModeler

=head1 SYNOPSIS

  BuildDatabase [-options] -name "mydb" <seqfile(s) in fasta format>
 or 
  BuildDatabase [-options] -name "mydb" 
                              -dir <dir containing fasta files *.fa, *.fasta,
                                     *.fast, *.FA, *.FASTA, *.FAST, *.dna,
                                     and  *.DNA > 
 or
  BuildDatabase [-options] -name "mydb" 
                              -batch <file containing a list of fasta files> 

=head1 DESCRIPTION

  This is basically a wrapper around AB-Blast's and NCBI Blast's
  DB formating programs.  It assists in aggregating files for processing 
  into a single database.  Source files can be specified by:

      - Placing the names of the FASTA files on the command
        line.
      - Providing the name of a directory containing FASTA files 
        with the file suffixes *.fa or *.fasta.
      - Providing the name of a manifest file which contains the 
        names of FASTA files ( fully qualified ) one per line.

  NOTE: Sequence identifiers are not preserved in this database. Each
        sequence is assigned a new GI ( starting from 1 ).  The 
        translation back to the original sequence is preserved in the
        *.translation file.

The options are:

=over 4

=item -h(elp)

Detailed help

=item -name <database name>

The name of the database to create.

=item -dir <directory>

The name of a directory containing fasta files to be processed.  The
files are recognized by their suffix.  Only *.fa and *.fasta files
are processed.

=item -batch <file>

The name of a file which contains the names of fasta files to process.
The files names are listed one per line and should be fully qualified.

=back

=head1 SEE ALSO

=over 4

RepeatModeler, RMBlast

=back

=head1 COPYRIGHT

Copyright 2004-2019 Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Getopt::Long;
use Pod::Text;
use File::Temp qw/ tempfile tempdir /;
use Cwd;

# RepeatModeler Libraries
use RepModelConfig;
use lib $ENV{REPEATMASKER_LIB};

#
# Class Globals & Constants
#
my $CLASS = "BuildDatabase";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepModelConfig::DEBUGALL == 1 );
my $version = $RepModelConfig::VERSION;

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @opts = qw( help dir=s engine=s batch=s name=s );

# Add configuration parameters as additional command-line options
push @opts, RepModelConfig::getCommandLineOptions();

#
# Provide the POD text from this file and
# from the config file by merging them
# together.  The heading "CONFIGURATION
# OVERRIDES" provides the insertion point
# for the configuration POD.
#
sub usage {
  my $p = Pod::Text->new();
  $p->output_fh( *STDOUT );
  my $pod_str;
  open IN, "<$0"
      or die "Could not open self ($0) for generating documentation!";
  while ( <IN> ) {
    if ( /^=head1\s+CONFIGURATION OVERRIDES\s*$/ ) {
      my $c_pod = RepModelConfig::getPOD();
      if ( $c_pod ) {
        $pod_str .= $_ . $c_pod;
      }
    }
    else {
      $pod_str .= $_;
    }
  }
  close IN;
  print "$0 - $version\n";
  $p->parse_string_document( $pod_str );
  exit( 1 );
}

#
# Get the supplied command line options, and set flags
#
my %options = ();
unless ( &GetOptions( \%options, @opts ) ) {
  usage();
}

# Print the internal POD documentation if something is missing
if ( ( $#ARGV == -1 && !$options{'batch'} && !$options{'dir'} )
     || $options{'help'} )
{
  print "No query sequence file indicated\n\n";
  usage();
}

#
# Resolve configuration settings using the following precedence:
# command line first, then environment, followed by config
# file.
#
RepModelConfig::resolveConfiguration( \%options );
my $config           = $RepModelConfig::configuration;
my $XDFORMAT_PRGM    = $config->{'ABBLAST_DIR'}->{'value'} . "/xdformat";
my $NCBIBLASTDB_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/makeblastdb";
my $NCBIDBCMD_PRGM   = $config->{'RMBLAST_DIR'}->{'value'} . "/blastdbcmd";

my $engine = "rmblast";
if ( $options{'engine'} && $options{'engine'} =~ /wublast|abblast/i ) {
  die "ERROR: \"-engine $options{'engine'}\" is deprecated, this verison of RepeatModeler uses rmblast only.\n";
}

if ( $options{'engine'} ) {
  if ( $options{'engine'} =~ /wublast|abblast/i ) {
    die "RepeatModeler doesn't appear to be configured to use the "
        . "$options{'engine'} search engine.  Please re-run configure before "
        . "using this or other RepeatModeler programs.\n"
        if ( !-x $XDFORMAT_PRGM );
    $engine = "abblast";
  }
  elsif ( $options{'engine'} =~ /rmblast|ncbi/i ) {
    die "RepeatModeler doesn't appear to be configured to use the "
        . "$options{'engine'} search engine.  Please re-run configure before "
        . "using this or other RepeatModeler programs.\n"
        if ( !-x $NCBIBLASTDB_PRGM );
    $engine = "rmblast";
  }
  else {
    print "I don't understand -engine $options{'engine'}\n";
    usage();
  }
}

die "Missing database name!\n" if ( !defined $options{'name'} );

my @fileList = ();

# Directory specified
if ( $options{'dir'} ) {
  if ( -d $options{'dir'} ) {

    # Process the directory
    opendir DIR, $options{'dir'}
        or die "Cannot open dir $options{'dir'}!\n";
    my $file;
    while ( defined( $file = readdir( DIR ) ) ) {
      if (    $file =~ /\.fa$/i
           || $file =~ /\.fasta$/i
           || $file =~ /\.fast$/i
           || $file =~ /\.dna/i )
      {
        push @fileList, "$options{'dir'}/$file";
      }
    }
    closedir( DIR );
  }
  else {
    die "Directory $options{'dir'} doesn't exist!\n";
  }
}

# Batch specified
if ( $options{'batch'} ) {
  if ( -s $options{'batch'} ) {
    open BATCH, "<$options{'batch'}"
        or die "Cannot open batch file $options{'batch'}!\n";
    while ( <BATCH> ) {
      s/[\n\r\s]+//g;
      if ( -s $_ ) {
        push @fileList, $_;
      }
      else {
        die "File $_ from batch $options{'batch'} does not exist!\n";
      }
    }
    close BATCH;
  }
  else {
    die "Batch file $options{'batch'} doesn't exist!\n";
  }
}

# Push remaining command line files
foreach my $file ( @ARGV ) {
  if ( -f $file ) {
    push @fileList, $file;
  }
  else {
    die "Command line fasta file $file does not exist!\n";
  }
}

#
# Sanitize sequence names
#
my $IDX;
my $index = 1;
open $IDX, ">$options{'name'}.translation"
    or die $CLASS . ": Cannot open file $options{'name'}.translation\n";

my $results = "";
my $dbSeqs  = 0;
my $dbSize  = 0;
print "Building database $options{'name'}:\n";

# Sanitizing the names of all input databases and placing
# in one big tempfile
#   $fh = tempfile();
my ( $SEQ, $seqFilename ) = tempfile( DIR => ".", UNLINK => 1 );
my $file;
for ( my $i = 0 ; $i <= $#fileList ; $i++ ) {
  $file = $fileList[ $i ];

  print "  Reading $file...\n";
  if ( $file =~ /^.*\.gz$/ ) {
    open IN, "gunzip -c $file|"
        or die $CLASS . ": Cannot open file using gunzip $file\n";
  }
  else {
    open IN, "<$file" or die $CLASS . ": Cannot open file $file\n";
  }
  while ( <IN> ) {
    if ( /^\>\s*(\S+)\s+(.*)/ ) {
      print $SEQ ">gi|$index\n";
      print $IDX "$1\t$index\n";
      $index++;
    }
    else {
      print $SEQ $_;
    }
  }
  close IN;
}
close $SEQ;
$file = $seqFilename;

if ( $engine eq "rmblast" ) {
  my $cmd = "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $options{'name'} -parse_seqids -dbtype nucl -in $file 2>&1";
  print "Running: $cmd\n" if ( $DEBUG );
  $results = `$cmd`;
  if ( $? ) {
    print "The makeblastdb program exited with code " . ($? >> 8) . ".  Please check your input file(s) for potential formating errors.\n$NCBIBLASTDB_PRGM returned: $results\n";
    print "The command used was: $cmd\n";
    die;
  }
}
else {
  $results = `$XDFORMAT_PRGM -n -o $options{'name'} $file 2>&1`;
  while ( $results =~
/sequences\s*\(letters\)\s*(?:written|appended):\s*([\d,]+)\s+\(([\d,]+)\).*$/mg
      )
  {
    my $size = $2;
    my $seqs = $1;
    $size =~ s/,//g;
    $dbSize += $size;
    $seqs =~ s/,//g;
    $dbSeqs += $seqs;
    last;
  }
}

if ( $engine eq "abblast" ) {
  $results = `$XDFORMAT_PRGM -n -X $options{'name'} 2>&1`;
}
else {
  $results = `$NCBIDBCMD_PRGM -db $options{'name'} -info 2>&1`;
  while (
        $results =~ /\s+([\d\,]+)\s+sequences;\s+([\d\,]+)\s+total bases.*$/mg )
  {
    my $size = $2;
    my $seqs = $1;
    $size =~ s/,//g;
    $dbSize += $size;
    $seqs =~ s/,//g;
    $dbSeqs += $seqs;
    last;
  }
}
print "Number of sequences (bp) added to database: $dbSeqs ( $dbSize bp )\n";

close $IDX;

unlink( $seqFilename ) if ( -e $seqFilename );

1;
