#! /opt/gensoft/adm/bin/perl
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
##---------------------------------------------------------------------------##
##  File:
##      @(#) RepeatModeler
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Takes one or more DNA sequence files, in fasta format, and returns
##      the consensus models of putative repeats.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2008-2021 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#
#

=head1 NAME

RepeatModeler - Model repetitive DNA

=head1 SYNOPSIS

  RepeatModeler [-options] -database <XDF Database>

=head1 DESCRIPTION

The options are:

=over 4

=item -h(elp)

Detailed help

=item -database

The name of the sequence database to run an analysis on.
This is the name that was provided to the BuildDatabase script
using the "-name" option.

=item -threads #

Specify the maximum number of threads which can be used by the
program at any one time. Note that the '-pa' parameter in previous
releases controlled the number of sequence batches compared 
in parallel using rmblastn (each running 4 threads).  Therefore,
if '-pa 4' was used previously the new thread parameter should be
set to '-threads 16'.

=item -recoverDir <Previous Output Directory>

If a run fails in the middle of processing, it may be possible recover
some results and continue where the previous run left off.  Simply supply
the output directory where the results of the failed run were saved and the
program will attempt to recover and continue the run.

=item -srand #

Optionally set the seed of the random number generator to a known value before
the batches are randomly selected ( using Fisher Yates Shuffling ).  This is
only useful if you need to reproduce the sample choice between runs.  This
should be an integer number.

=item -LTRStruct

Run the LTR structural discovery pipeline ( LTR_Harvest and LTR_retreiver ) and 
combine results with the RepeatScout/RECON pipeline. [optional]

=item -genomeSampleSizeMax #

Optionally change the maximum sample size (bp).  The sample sizes for RECON
are increased until this number is reached (default: 270MB in 5 rounds, or
243MB in 6 rounds for '-quick' option).

=item -numAdditionalRounds #

Optionally increase the number of rounds.  The sample size for 
additional rounds will change by size multiplier (currently 3).

=item -quick

In RepeatModeler 2.0.4 due to improvements in masking and parallelization the
sample sizes have increased improving sensitivity.  Using the quick option 
reverts back to the original sample sizes (pre 2.0.4) allowing for similar 
sensitivity as before, at a faster rate.

=back

=head1 CONFIGURATION OVERRIDES

=head1 SEE ALSO

=over 4

RepeatMasker, RMBlast

=back

=head1 COPYRIGHT

 Copyright 2005-2022 Institute for Systems Biology

=head1 AUTHOR

 RepeatModeler:
   Robert Hubley <rhubley@systemsbiology.org>
   Arian Smit <asmit@systemsbiology.org>

 LTR Pipeline Extensions:
   Jullien Michelle Flynn <jmf422@cornell.edu>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Getopt::Long;
use POSIX qw(:sys_wait_h ceil floor);
use File::Copy;
use File::Spec;
use File::Path;
use File::Basename;
use Cwd qw(abs_path getcwd cwd);
use Data::Dumper;
use Pod::Text;
use Time::HiRes qw( gettimeofday tv_interval);

# RepeatModeler Libraries
use RepModelConfig;
use lib $RepModelConfig::configuration->{'REPEATMASKER_DIR'}->{'value'};
use RepeatUtil;
use SeedAlignment;
use SeedAlignmentCollection;
use ThreadedTaskSimple;

# RepeatMasker Libraries
use SearchResult;
use SearchResultCollection;
use WUBlastSearchEngine;
use NCBIBlastSearchEngine;
use SeqDBI;
use SimpleBatcher;
use FastaDB;


# use Devel::Size qw(size total_size);

#
# Global Class Variables/Constants
#
my $CLASS = "RepeatModeler";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepModelConfig::DEBUGALL == 1 );
$| = 1;    # Turn autoflush on
my %TimeBefore = ();

#
# Version
#
my $version = $RepModelConfig::VERSION;

my $cmdLine = $0 . join( ' ', @ARGV );

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number parameters
#
my @getopt_args = (
                    '-version',                 # print out the version and exit
                    '-help',
                    '-dir=s',
                    '-debug',
                    '-quick',
                    '-database=s',
                    '-engine=s',
                    '-recoverDir=s',
                    '-pa=i',
                    '-threads=i',
                    '-srand=i',
                    '-rsSampleSize=s',
                    '-numAddlRounds=s',
                    '-genomeSampleSizeMax=s',
                    '-LTRStruct',
                    '-LTRMaxSeqLen=i',
                    '-onlyRS',
                    '-skipRS',
                    '-disableRMBlastStat',
                    '-onlyIns',                     # For internal use only
                    '-recon_batch_size=i',          # For debugging only 
                    '-recon_sample_size_start=i',   # For debugging only
);

# Add configuration parameters as additional command-line options
push @getopt_args, RepModelConfig::getCommandLineOptions();

#
# Get the supplied command line options, and set flags
#
my %options = ();
Getopt::Long::config( "noignorecase", "bundling_override" );
unless ( GetOptions( \%options, @getopt_args ) ) {
  usage();
}

$DEBUG = 1 if ( exists $options{'debug'} );

#
# Provide the POD text from this file and
# from the config file by merging them
# together.  The heading "CONFIGURATION
# OVERRIDES" provides the insertion point
# for the configuration POD.
#
sub usage {
  my $p = Pod::Text->new();
  $p->output_fh( *STDOUT );
  my $pod_str;
  open IN, "<$0"
      or die "Could not open self ($0) for generating documentation!";
  while ( <IN> ) {
    if ( /^=head1\s+CONFIGURATION OVERRIDES\s*$/ ) {
      my $c_pod = RepModelConfig::getPOD();
      if ( $c_pod ) {
        $pod_str .= $_ . $c_pod;
      }
    }
    else {
      $pod_str .= $_;
    }
  }
  close IN;
  print "$0 - $version\n";
  $p->parse_string_document( $pod_str );
  exit( 1 );
}

#
# Resolve configuration settings using the following precedence:
# command line first, then environment, followed by config
# file.
#
RepModelConfig::resolveConfiguration( \%options );
my $config           = $RepModelConfig::configuration;
my $NCBIDBCMD_PRGM   = $config->{'RMBLAST_DIR'}->{'value'} . "/blastdbcmd";
my $NCBIBLASTDB_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/makeblastdb";
my $NCBIDBALIAS_PRGM =
    $config->{'RMBLAST_DIR'}->{'value'} . "/blastdb_aliastool";
my $DUSTMASKER_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/dustmasker";
my $RMBLASTN_PRGM   = $config->{'RMBLAST_DIR'}->{'value'} . "/rmblastn";
my $XDFORMAT_PRGM   = $config->{'ABBLAST_DIR'}->{'value'} . "/xdformat";
my $XDGET_PRGM      = $config->{'ABBLAST_DIR'}->{'value'} . "/xdget";
my $WUBLASTN_PRGM   = $config->{'ABBLAST_DIR'}->{'value'} . "/blastn";
my $TRF_DIR         = $config->{'TRF_DIR'}->{'value'};
my $RSCOUT_DIR      = $config->{'RSCOUT_DIR'}->{'value'};
my $RECON_DIR       = $config->{'RECON_DIR'}->{'value'};
my $CDHIT_DIR       = $config->{'CDHIT_DIR'}->{'value'};

## validate params
# Minimal commands
foreach my $param ( "RMBLAST_DIR", "TRF_DIR", "RSCOUT_DIR", "RECON_DIR" ) {
  if ( !RepModelConfig::validateParam( $param ) ) {
    die
"\n  RepeatModeler dependency missing or incorrectly set for $param!\n  Rerun ./configure or check your command line to ensure that RepeatModeler\n  has access to and the correct version of this dependency.\n\n";
  }
}

# LTRPipeline dependencies
if ( $options{'LTRStruct'} ) {
  foreach my $param (
                      "GENOMETOOLS_DIR", "LTR_RETRIEVER_DIR",
                      "MAFFT_DIR",       "NINJA_DIR",
                      "CDHIT_DIR"
      )
  {
    if ( !RepModelConfig::validateParam( $param ) ) {
      die
"\n  LTRPipeline dependency missing or incorrectly set for $param!\n   Rerun ./configure or check your command line to ensure that RepeatModeler\n  has access to and the correct version of this dependency.\n\n";
    }
  }
  unless ( eval "require threads;" ) 
  {
    die "LTRRetriever used in the LTRPipeline depends on perl threads.  This version of\n".
        "perl does not appear to be compiled with the thread option.\n";
  }
}

if ( $options{'version'} ) {
  print "RepeatModeler version $version\n";
  exit;
}

# Print the internal POD documentation if something is missing
if ( !defined $options{'database'} || $options{'help'} ) {
  print "No database indicated\n\n";
  usage();
}

if ( $options{'pa'} ) {
  die "\nERROR: The -pa parameter has been deprecated.  Please use the newer\n" .
        "      \"-threads #\" parameter which precisely controls the maximum\n" .
        "       number of simultaneous threads this program will use. Note that\n" .
        "       the -pa parameter previously controlled the number of simultaneous\n" .
        "       batches each run with rmblast using 4 threads, therefore -pa 4 could\n" .
        "       potentially have used up to 16 threads.  Keep this in mind when\n" .
        "       comparing runtime differences between versions.\n\n";
}

if ( ! exists $options{'threads'} || $options{'threads'} < 2 ) {
  print "WARNING: RepeatModeler is a computationally intensive program.\n" .
        "         It is recommended that for anything other than debugging\n" .
        "         purposes the program should run with greater than eight\n" .
        "         threads (-threads #).\n\n";
}
# Make it easier to pass to functions
my $threads = $options{'threads'} ? $options{'threads'} : 1;

# Seed the random number generator if requested
my $seed = time;
$seed = $options{'srand'} if ( defined $options{'srand'} );
srand( $seed );

if ( exists $options{'disableRMBlastStat'} ) {
  $ENV{'BLAST_USAGE_REPORT'} = "false";
}

#
# Setup the search engines
#
my $searchEngineN;
my $engine = "rmblast";
my $engineVersion = "";
my $isRMBlastQueryThreadable = 0; # A flag that is set if RMBlastn supports query threading (2.13 and above)

# DEPRECATED: Using only rmblast now
if ( $options{'engine'} && $options{'engine'} =~ /wublast|abblast/i ) {
  die "ERROR: \"-engine $options{'engine'}\" is deprecated, this verison of RepeatModeler uses rmblast only.\n";
}
if ( $engine ) {
  if ( $engine =~ /rmblast|ncbi/i ) {
    $engine        = "rmblast";
    $searchEngineN =
        NCBIBlastSearchEngine->new( pathToEngine => $RMBLASTN_PRGM );
    if ( not defined $searchEngineN ) {
      die "Cannot execute $RMBLASTN_PRGM please make "
          . "sure you have setup RepeatModeler to use NCBI (RMBlast) by "
          . "running the configure script.\n";
    }
    $engineVersion = $searchEngineN->getVersion();
    if ( $engineVersion =~ /.*(\d+)\.(\d+)\.\d+.*/ ) {
      my $major = $1;
      my $minor = $2;
      if ( $major > 2 || ( $major == 2 && $minor >= 13 ) ) {
        $isRMBlastQueryThreadable = 1;
      }
    }
  }
  else {
    print "I don't recognize the search engine type:  $engine\n";
    exec "pod2text $0";
    die;
  }
}

##
## Sampling/Iteration Parameters
##
my $rsSampleSize;                  # The size of the sequence given
                                   #   to RepeatScout. ( round #1 )
my $fragmentSize;                  # The size of the batches for
                                   #   all-vs-other search.
my $reconSampleSizeStart;          # The initial sample size for
                                   #   RECON analysis.
my $genomeSampleSizeMax;           # The max sample size for
                                   #   RECON analysis
my $genomeSampleSizeMultiplier;    # The multiplier for sample size
                                   #   between rounds
my $numAdditionalRounds;           # Number of additional rounds at
                                   #   genomeSampleSizeMax to run.
my $recommendedFamilySize;         # RECON/RS family size minimum
                                   #   needed to build a family.

if ( $options{'quick'} ) {
  # Pre-RepeatModeler 2.0.4 Sampling Strategy  
  #   RepeatScout Round 1: 40MB
  #   RECON Rounds 2-6   : 3,9,27,81,243MB
  $rsSampleSize               = 40000000; 
  $fragmentSize               = 40000;       
  $reconSampleSizeStart       = 3000000;   
  $genomeSampleSizeMax        = 243000000;   
  $genomeSampleSizeMultiplier = 3; 
  $numAdditionalRounds = 0;                  
  $recommendedFamilySize = 15;
}else {
  # RepeatModeler 2.0.4+ Sampling Strategy  
  #   RepeatScout Round 1: 40MB
  #   RECON Rounds 2-5   : 10,30,90,270MB
  $rsSampleSize               = 40000000; 
  $fragmentSize               = 40000;       
  $reconSampleSizeStart       = 10000000;   
  $genomeSampleSizeMax        = 270000000;   
  $genomeSampleSizeMultiplier = 3; 
  $numAdditionalRounds = 0;                  
  $recommendedFamilySize = 15;
}

##
## Special Case: Input consists of only TE insertions
##               relative to it's direct ancestral
##               reconstructed genome.  This is an 
##               experimental feature used by the
##               Zoonomia project.
##
my $insertMinFamilySize;
if ( $options{'onlyIns'} ) {
  $genomeSampleSizeMax        = 300000000;
  $reconSampleSizeStart       = 200000;
  $genomeSampleSizeMultiplier = 4;
  $recommendedFamilySize      = 50;
  $insertMinFamilySize        = 15;
  $numAdditionalRounds = 0;
}
                                               
# Sampling parameter overrides
$numAdditionalRounds = $options{'numAddlRounds'} if ( $options{'numAddlRounds'} );
$rsSampleSize = $options{'rsSampleSize'} if ( $options{'rsSampleSize'} );
$genomeSampleSizeMax = $options{'genomeSampleSizeMax'}
    if ( $options{'genomeSampleSizeMax'} );
$fragmentSize = $options{'recon_batch_size'} if ( exists $options{'recon_batch_size'} );
$reconSampleSizeStart = $options{'recon_sample_size_start'} if ( exists $options{'recon_sample_size_start'} );

# Feature On Hold : We need a system that takes into
#   account genome size, false positive rate of each
#   discovery method, and detection target.
# The absolute minimum RECON/RS family size for which
#   we will build a model.  Only invoked if there
#   are only small families returned by RECON/RS.
#my $minFamilySize = 4;

#
# Gather details from the database
#
my $genomeDB = $options{'database'};
if ( -s $genomeDB ) {
  $genomeDB = File::Spec->rel2abs($genomeDB);
}elsif ( -s "$genomeDB.nsq" ){
  $genomeDB = File::Spec->rel2abs("$genomeDB.nsq");
}
$genomeDB =~ s/(.+)\.n[nihs][rndiq]$/$1/;
my $dbIndex;
$dbIndex = `$NCBIDBCMD_PRGM -db $genomeDB -entry all -outfmt "%i %l"`;

# Initialize working directory and logging
my $tmpDir;
my $LOG;
if ( $options{'recoverDir'} && -d $options{'recoverDir'} ) {
  $tmpDir =  abs_path($options{'recoverDir'});
  open $LOG, ">>", "$tmpDir/rmod.log" or die "Could not open up $tmpDir/rmod.log for writing (appending)!\n";
}else {
  #
  # Create temp directory
  #
  my @tmpDirPath =
      ( getcwd(), ( File::Spec->splitpath( $options{'database'} ) )[ 1 ] );

  if ( -d $options{'dir'} ) {
    $tmpDir = $options{'dir'};
  }
  else {
    $tmpDir = createTempDir( \@tmpDirPath );
  }
  open $LOG, ">", "$tmpDir/rmod.log" or die "Could not open up $tmpDir/rmod.log for writing!\n";
}

# print to screen and log
sub log_print {
  my $string = shift;
  print "$string";
  print $LOG "$string";
}

#
# Print greeting
#
my $title = "RepeatModeler Version $version";
log_print "$title\n";
log_print "=" x length( $title ) . "\n";
log_print "Using output directory = $tmpDir\n";
log_print "Search Engine = $engine $engineVersion\n";
log_print "Threads = " . $options{'threads'} . "\n" if ( $options{'threads'} );
log_print "Dependencies: TRF " . RepModelConfig::getDependencyVersion("TRF_DIR") . ", RECON " . RepModelConfig::getDependencyVersion("RECON_DIR") . ", RepeatScout " . RepModelConfig::getDependencyVersion("RSCOUT_DIR") . ", RepeatMasker " . RepModelConfig::getDependencyVersion("REPEATMASKER_DIR") . "\n";
if ( $options{'LTRStruct'} ) {
  log_print "LTR Structural Analysis: Enabled ( " . 
      "GenomeTools " . RepModelConfig::getDependencyVersion("GENOMETOOLS_DIR") . ", " .
      "LTR_Retriever " . RepModelConfig::getDependencyVersion("LTR_RETRIEVER_DIR") . ",\n" .
      "                                   Ninja " . RepModelConfig::getDependencyVersion("NINJA_DIR") . ", " .
      "MAFFT " . RepModelConfig::getDependencyVersion("MAFFT_DIR") . ",\n" .
      "                                   CD-HIT " . RepModelConfig::getDependencyVersion("CDHIT_DIR") . " )\n";
}
else {
  log_print "LTR Structural Analysis: Disabled [use -LTRStruct to enable]\n";
}
log_print "Random Number Seed: $seed\n";
log_print "Database = $genomeDB ";

my $dbSize       = 0;
my @contigSizes  = ();
while ( $dbIndex =~ />?(\S+)\s+(\d+)\s*$/mg ) {
  print "." if ( ( $#contigSizes % 10000 ) == 0 );
  push @contigSizes,  $2;
  $dbSize += $2;
}
undef $dbIndex;
print "\n";

die "Database $genomeDB does not contain any sequences!\n"
    if ( scalar(@contigSizes) < 1 );
log_print "  - Sequences = " . ( $#contigSizes + 1 ) . "\n";
log_print "  - Bases = $dbSize\n";

if ( @contigSizes > 100 ) {
  # Should be longest to shortest for N50 calc: Fixed 20210225
  @contigSizes = sort { $b <=> $a } @contigSizes;

  my $tabSize      = 0;
  my $halfAssembly = $dbSize / 2;
  my $N50          = 0;
  for ( my $i = 0 ; $i < $#contigSizes ; $i++ ) {
    $tabSize += $contigSizes[ $i ];
    $N50 = $contigSizes[ $i ];
    last if ( $tabSize + $contigSizes[ $i ] > $halfAssembly );
  }
  log_print "  - N50 = $N50\n";

  # Sort shortest to longest for histogram
  @contigSizes = sort { $a <=> $b } @contigSizes;
  log_print "  - Contig Histogram:\n";
  log_print
"  Size(bp)                                                        Count\n";
  log_print
"  -----------------------------------------------------------------------\n";
  my @bins          = ();
  my $numBins       = 15;
  my $minVal        = $contigSizes[ 0 ];
  my $maxHistBarCnt = 0;
  my $binSize       = ( $contigSizes[ $#contigSizes ] - $minVal ) / $numBins;
  $binSize = $minVal if ( $binSize == 0 );

  foreach my $val ( @contigSizes ) {
    my $bin = int( ( $val - $minVal ) / $binSize );
    # JR 20200121: the largest contig will normally end up in
    # "one-past-the-last" bin by the above formula
    if ($bin >= $numBins) {
      $bin = $numBins - 1;
    }
    $bins[ $bin ]++;
    $maxHistBarCnt = $bins[ $bin ] if ( $bins[ $bin ] > $maxHistBarCnt );
  }

  my $labelWidth = ( length( $binSize * $numBins ) * 2 ) + 1;
  my $histoWidth = 50;
  my $markTic    = $maxHistBarCnt / $histoWidth;

  for ( my $i = $numBins - 1 ; $i >= 0 ; $i-- ) {
    my $rangeStart = int( $minVal + ( $i * $binSize ) );
    my $rangeStr = "$rangeStart-" . int( $rangeStart + $binSize );
    if ( $i == $numBins - 1 ) {
      $rangeStr = "$rangeStart-$contigSizes[$#contigSizes]";
    }
    log_print "  $rangeStr" . " " x ( $labelWidth - length( $rangeStr ) ) . " |";
    log_print "*" x ( int( $bins[ $i ] / $markTic ) )
        . " " x ( $histoWidth - int( $bins[ $i ] / $markTic ) )
        . " [ $bins[$i] ]\n";
  }
  log_print "\n";
  undef @bins;
  if ( $N50 < 10000 ) {
    log_print
"  WARN: The N50 for this assembly is low ( <10,000 ).  The de novo methods\n";
    log_print
"        employed by RepeatModeler are intended for use with long contiguous\n";
    log_print
"        sequences and may not perform well with an over-abundance of short\n";
    log_print "        contigs in the database.\n";
  }
}
undef @contigSizes;

die "Database $genomeDB is not large enough ( minimum $fragmentSize bp ) to "
    . "process with RepeatModeler.\n"
    if ( $fragmentSize > $dbSize );

my $dbSourceList;
if ( $options{'onlyIns'} ) {
  $dbSourceList = &initializeSampleSourceList(
                                               approxBlockSize => $fragmentSize,
                                               genomeDB        => $genomeDB,
                                               dbSize          => $dbSize,
                                               noFrag          => 1
  );
}
else {
  $dbSourceList = &initializeSampleSourceList(
                                               approxBlockSize => $fragmentSize,
                                               genomeDB        => $genomeDB,
                                               dbSize          => $dbSize
  );
}


#
# Randomize List
#
&fisherYatesShuffle( $dbSourceList );

#
# Setup initial parameters
#

my $round = 1;
$round = 2 if ( $options{'skipRS'} );
my $totSampledBP = 0;
my $familyCutoff = $recommendedFamilySize;
my $sampleSize   = $rsSampleSize;
if ( $options{'skipRS'} ) {

  # If we are skipping RS we need to do this early check to make sure
  # the genome isn't really small and would cause only one round of
  # RECON to run.
  $sampleSize = $reconSampleSizeStart;
  if ( ( $dbSize - ( $totSampledBP + $sampleSize ) ) <= ( $sampleSize / 2 ) ) {
    print "   - Increasing sample size to include end piece now = ";
    $sampleSize += ( $dbSize - ( $totSampledBP + $sampleSize ) );
    print "$sampleSize\n";
  }
}

#
# Tweak initial parameters if we are recovering
# from a failed run
#
if ( $options{'recoverDir'} && -d $options{'recoverDir'} ) {
  my $recoverDir = $options{'recoverDir'};
  ## Determine highest successful round
  my $i                = 0;
  my $highestGoodRound = 0;
  while ( $i++ < 100 ) {
    if ( -d "$recoverDir/round-$i" ) {
      if (    ( $i == 1 && -s "$recoverDir/round-1/consensi-refined.fa" )
           || ( $i > 1 && -s "$recoverDir/round-$i/consensi.fa" ) )
      {
        $highestGoodRound = $i;
      }
      else {
        last;
      }
    }
    else {
      last unless ( $i == 1 && $options{'skipRS'} );
    }
  }

  if ( $highestGoodRound <= 1 ) {
    print "\n\nOops...the $recoverDir run did not get passed round-1.\n"
        . "It makes more sense to restart this run from the beginning.\n"
        . "Remove the -recoverDir option and rerun the program.\n\n";
    exit;
  }

  my $badRound    = ( $highestGoodRound + 1 );
  my $badRoundDir = "$recoverDir/round-$badRound";

  if ( !-d $badRoundDir ) {
    print "\n\nThis directory ( $recoverDir )\n"
        . "appears to contain a successful run of RepeatModeler.  If this\n"
        . "is not the case, please report this as a bug at the RepeatMasker\n"
        . "website ( www.repeatmasker.org )\n\n";
    exit;
  }

  print "\n\n** RECOVERING $recoverDir **\n\n";
  print "  - Attempting to rerun round $badRound\n";

  ## Backup previous results
  print "  - Backing up previous $recoverDir/consensi.fa file\n";
  if ( -s "$recoverDir/consensi.fa" ) {

    # Back this up
    my $i = 1;
    while ( -s "$recoverDir/consensi.fa.backup_$i" && $i < 100 ) {
      $i++;
    }
    if ( !-s "$recoverDir/consensi.fa.backup_$i" ) {
      system( "cp $recoverDir/consensi.fa $recoverDir/consensi.fa.backup_$i" );
    }
    else {
      die "Something went wrong while trying to backup:\n"
          . "       $recoverDir/consensi.fa\n";
    }
  }

  print "  - Backing up previous $recoverDir/round-$badRound" . " directory.\n";

  # Move bad directory aside
  my $i = 1;
  while ( -d "$recoverDir/round-$badRound.backup_$i" && $i < 100 ) {
    $i++;
  }
  if ( !-d "$recoverDir/round-$badRound.backup_$i" ) {
    system(
       "mv $recoverDir/round-$badRound $recoverDir/round-$badRound.backup_$i" );
  }
  else {
    die "Something went wrong while trying to backup:\n"
        . "       $recoverDir/round-$badRound\n";
  }

  print "  - Recalculating sample size...( please be patient )\n";

  # Recalculate totSampledBP
  $totSampledBP = 0;
  my $startIdx = 1;
  $startIdx = 2 if ( $options{'skipRS'} );
  for ( my $i = $startIdx ; $i <= $highestGoodRound ; $i++ ) {

    # Open up sampleDB-$i.fa with FastaDB and count bases
    my $sDB = FastaDB->new( fileName => "$recoverDir/round-$i/sampleDB-$i.fa",
                            openMode => SeqDBI::ReadOnly );

    foreach my $seqID ( $sDB->getIDs() ) {
      $totSampledBP += $sDB->getSeqLength( $seqID );
    }
    undef $sDB;
  }

  # Set round number
  $round = $highestGoodRound + 1;

  # Adjust sample size
  #   - Only starts increasing for RECON after round 2
  for ( my $i = 2 ; $i < $badRound ; $i++ ) {
    $sampleSize *= $genomeSampleSizeMultiplier;
  }
  $sampleSize = $genomeSampleSizeMax
      if ( $sampleSize > $genomeSampleSizeMax );
}

#
# Evaluate storage performance  : This program is heavily filesystem
# dependent.  If the working directory is stored on a network filesystem
# the program will run slowly.  The numbers choosen represent arbitrary
# cutoffs based on current hardware ( Jan/2018 )..
#
my $throughput  = benchmarkStorage( "$tmpDir/bench.dat" );
my $performance = "Undetermined";
if ( $throughput < 300 ) {
  $performance = "poor";
}
elsif ( $throughput >= 300 && $throughput < 700 ) {
  $performance = "fair";
}
elsif ( $throughput >= 700 && $throughput < 1000 ) {
  $performance = "good";
}
elsif ( $throughput >= 1000 ) {
  $performance = "excellent";
}
log_print "Storage Throughput = $performance ( "
    . sprintf( "%0.2f", $throughput )
    . " MB/s )\n";
if ( $performance eq "poor" ) {
  log_print
"  - NOTE: Poor storage througput will have a large impact on RepeatModeler\n"
      . "          performance.  The low throughput observed above may be due to\n"
      . "          transient usage patterns on the system and may not reflect the\n"
      . "          actual system performance. Whenever possible run RepeatModeler\n"
      . "          in a directory stored on a fast local disk and not over a\n"
      . "          network filesytem.\n";
}

#
# Basic info
#
print "\nReady to start the sampling process.\n";
print
"INFO: The runtime of RepeatModeler heavily depends on the quality of the assembly\n";
print
"      and the repetitive content of the sequences.  It is not imperative\n";
print
    "      that RepeatModeler completes all rounds in order to obtain useful\n";
print
"      results.  At the completion of each round, the files ( consensi.fa, and\n";
print "      families.stk ) found in:\n";
print "      $tmpDir/ \n";
print "      will contain all results produced thus far. These files may be \n";
print
"      manually copied and run through RepeatClassifier should the program\n";
print "      be terminated early.\n";

#
# Main loop
#
#    Sample from the sequence database starting from $reconSampleSizeStart
#  and increasing by a factor of $genomeSampleSizeMultiplier until either
#  the sequence database is fully covered or we complete one round with
#  the sample size at $genomeSampleSizeMax.  NOTE: If -numAddlRounds is
#  used, then additional rounds at $genomeSampleSizeMax will be run.
#
elapsedTime( "runtime" );    # Setup the timers

# TODO: What if we begin with such a small sample....?
#        Change the cutoff and possibly increase to full genome size
my $families;
my $numModels;
my $inLastRound = 0;
while ( 1 ) {

  # Reset the timers
  elapsedTime( 1 );
  elapsedTime( 2 );

  my $seqRemaining = ( $dbSize - ( $totSampledBP + $sampleSize ) );

  log_print "\n\nRepeatModeler Round # $round\n";
  log_print "========================\n";
  log_print "Searching for Repeats\n";
  log_print " -- Sampling from the database...\n";

  if (
       (
         $options{'onlyIns'} && (    $sampleSize >= $genomeSampleSizeMax
                                  || $sampleSize >= $dbSize )
       )
       || (
            !$options{'onlyIns'}
            && (    $totSampledBP >= $dbSize
                 || ( $sampleSize >= $genomeSampleSizeMax && $numAdditionalRounds <= 0 )
                 || $seqRemaining < $fragmentSize )
       )
      )
  {

    # this round will be last
    $inLastRound = 1;
    #print "   - Last round, lowering family cutoff to $minFamilySize elements.\n";
    #$familyCutoff = $minFamilySize;
    if ( $options{'onlyIns'} ) {
      log_print "   - Last round, lowering family cutoff to $insertMinFamilySize elements.\n";
      $familyCutoff = $insertMinFamilySize;
    }
  }
  # Setup file locations
  my $roundTmpDir       = "$tmpDir/round-$round";
  my $instFile          = "$tmpDir/instdb.txt";
  my $sampleFastaFile   = "$roundTmpDir/sampleDB-$round.fa";
  my $consensiFile      = "$tmpDir/consensi.fa";
  my $combFamiliesFile  = "$tmpDir/families.stk";
  my $roundConsensiFile = "$roundTmpDir/consensi.fa";
  mkdir( $roundTmpDir );

  # Fine grained timing
  elapsedTime( 3 );

  # Sample from the input database
  my $usedBlockIndex = ();
  if ( $round == 1 ) {
    log_print "   - Gathering up to $rsSampleSize bp\n";
    $usedBlockIndex = &sampleFromDB(
                                     sampleSizeMax  => $rsSampleSize,
                                     dbSourceBlocks => $dbSourceList,
                                     dbType         => $engine,
                                     dbFile         => $genomeDB,
                                     fastaFile      => $sampleFastaFile,
                                     prohibitX      => 1
    );
    &clearSampleUsedCounts( dbSourceBlocks => $dbSourceList );
  }
  else {
    if ( $options{'onlyIns'} ) {
      log_print "   - Gathering up to $sampleSize bp ( with replacement )\n";
      &sampleFromDB(
                     sampleSizeMax   => $sampleSize,
                     dbSourceBlocks  => $dbSourceList,
                     dbType          => $engine,
                     dbFile          => $genomeDB,
                     withReplacement => 1,
                     fastaFile       => $sampleFastaFile
      );

      # If we are going to sample with replacement we need to at
      # least reshuffle the source list
      &fisherYatesShuffle( $dbSourceList );
    }
    else {
      log_print "   - Gathering up to $sampleSize bp\n";
      &sampleFromDB(
                     sampleSizeMax  => $sampleSize,
                     dbSourceBlocks => $dbSourceList,
                     dbType         => $engine,
                     dbFile         => $genomeDB,
                     fastaFile      => $sampleFastaFile
      );

    }
  }

  # Tabulate non-Ambiguous bases
  my $sampleDB = FastaDB->new( fileName => $sampleFastaFile,
                               openMode => SeqDBI::ReadOnly );
  my $actualGenomeSampleSize = 0;
  my $preMaskNonAmbigBases   = 0;
  my %sampleContigs          = ();
  foreach my $seqID ( $sampleDB->getIDs() ) {
    my $tmpName = $sampleDB->getDescription( $seqID );
    $tmpName = $1 if ( $tmpName =~ /(\S+):\d+-\d+/ );
    $sampleContigs{$tmpName}++;
    $preMaskNonAmbigBases   += $sampleDB->getSubtLength( $seqID );
    $actualGenomeSampleSize += $sampleDB->getSeqLength( $seqID );
  }
  undef $sampleDB;
  #print "sampleContigs = " . total_size(\%sampleContigs) . "\n";

  if ( $round == 1 ) {
    log_print "   - Final Sample Size = $actualGenomeSampleSize "
        . "bp ( $preMaskNonAmbigBases non ambiguous )\n";
    log_print "   - Num Contigs Represented = "
        . scalar( keys( %sampleContigs ) ) . "\n";
    log_print "   - Sequence extraction : " . elapsedTime( 3 ) . "\n";
    elapsedTime("repeatscout");
    log_print " -- Running RepeatScout on the sequences...\n";
    &runRepeatScout(
                     workDir     => $roundTmpDir,
                     rscoutPath  => $RSCOUT_DIR,
                     trfPrgm     => "$TRF_DIR/trf",
                     dustmskPrgm => $DUSTMASKER_PRGM,
                     fastaFile   => $sampleFastaFile,
                     seqSize     => $actualGenomeSampleSize
    );
    log_print "   - RepeatScout: " . elapsedTime( "repeatscout" ) . "\n";

    my $roundModels = 0;
    if ( -s "$sampleFastaFile.rscons.filtered" ) {
      system( "cp $sampleFastaFile.rscons.filtered $roundConsensiFile" );

      # MAKEBLASTDB the database
      system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $sampleFastaFile "
              . "-parse_seqids -dbtype nucl -in $sampleFastaFile >> "
              . "$roundTmpDir/makeblastdb.log 2>&1" );

      ( $roundModels ) = &buildRSConsensiMT(
                                 workDir          => $roundTmpDir,
                                 consensiFile     => $consensiFile,
                                 combFamiliesFile => $combFamiliesFile,
                                 round            => $round,
                                 familyCutoff     => $familyCutoff,
                                 threads          => $threads,
                                 numContigs => scalar( keys( %sampleContigs ) ),
                                 usedBlockIndex => $usedBlockIndex
      );

      $numModels = $roundModels;
      # We don't need to keep the large families datastructure around
      # at this point
      undef $families;
    }
    else {
      system( "touch $sampleFastaFile.rscons.filtered" );
      log_print "NOTE: RepeatScout did not return any models.\n";
    }
    $round++;

    last if ( $options{'onlyRS'} );

    # reset the sampleSize for the first round of RECON.
    $sampleSize = $reconSampleSizeStart;

    log_print "Round Time: " . elapsedTime( 1 ) . " : $roundModels families discovered.\n";
    next;
  }
  log_print "   - Sequence extraction : " . elapsedTime( 3 ) . "\n";

  # TRF sample
  log_print " -- Running TRFMask on the sequence...\n";

  # copy sample file for mask processing
  system( "cp $sampleFastaFile $roundTmpDir/tmpSample.fa" );

  system( "TRFMask $roundTmpDir/tmpSample.fa" );
  if ( -s "$roundTmpDir/tmpSample.fa.masked" ) {
    system( "mv $roundTmpDir/tmpSample.fa.masked $roundTmpDir/tmpSample.fa" );
  }
  log_print "   - TRFMask time " . elapsedTime( 3 ) . "\n";

  if ( -s $consensiFile ) {
    log_print " -- Masking repeats from the previous rounds...\n";

    $searchEngineN->setMinScore( 250 );
    # Changed 10/15/2022 -- shouldn't need alignments for masking
    $searchEngineN->setGenerateAlignments( 0 );
    $searchEngineN->setGapInit( -25 );
    $searchEngineN->setBandwidth( 10 );    # Changes gapW=31
    $searchEngineN->setInsGapExt( -5 );
    $searchEngineN->setDelGapExt( -5 );
    $searchEngineN->setMinMatch( 7 );
    $searchEngineN->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
    $searchEngineN->setAdditionalParameters( undef );
    $searchEngineN->setMaskLevel( undef );

    my $annots;
    my $bp_masked;
    if ( $searchEngineN->getVersion() =~ /.*(\d+)\.(\d+)\.\d+.*/ &&
         ( $1 > 2 || ($1 == 2 && $2 >= 13) ) ) {
      ($annots, $bp_masked) = RepeatUtil::ncbiMaskDatabaseNativeMT(
                                  makeDBPath   => $NCBIBLASTDB_PRGM,
                                  rmblastnPath => $RMBLASTN_PRGM,
                                  workingDir   => $roundTmpDir,
                                  fastaFile    => "$roundTmpDir/tmpSample.fa",
                                  consensi     => $consensiFile,
                                  threads      => $threads );
    }else {
      ($annots, $bp_masked) = RepeatUtil::ncbiMaskDatabaseMT(
                                  makeDBPath   => $NCBIBLASTDB_PRGM,
                                  dbCMDPath    => $NCBIDBCMD_PRGM,
                                  aliasPath    => $NCBIDBALIAS_PRGM,
                                  rmblastnPath => $RMBLASTN_PRGM,
                                  workingDir   => $roundTmpDir,
                                  fastaFile    => "$roundTmpDir/tmpSample.fa",
                                  consensi     => $consensiFile,
                                  threads      => $threads );

    }
    log_print "       $annots repeats masked totaling $bp_masked bp(s).\n";
    
    # clear special parameters 10/15/22
    $searchEngineN->setAdditionalParameters( undef );

    if ( -s "$roundTmpDir/tmpSample.fa.masked" ) {
      system( "mv $roundTmpDir/tmpSample.fa.masked $roundTmpDir/tmpSample.fa" );
    }
    log_print "   - TE Masking time " . elapsedTime( 3 ) . "\n";

    if ( $options{'onlyIns'} ) {

      # Remove sequences containing only/mostly Ns.  Because we
      # need to preserve the id's *and* they need to be contiguous,
      # we will simply replace these sequences with a short run
      # of N's as a placeholder.
      # First, reverse the replacement above ( masked is masked )
      system( "mv $roundTmpDir/tmpSample.fa $roundTmpDir/tmpSample.fa.masked" );
      open IN,  "<$roundTmpDir/tmpSample.fa.masked" or die;
      open OUT, ">$roundTmpDir/tmpSample.fa"        or die;
      my $id;
      my $seq;
      my $desc;
      my $seqsCut = 0;
      while ( <IN> ) {

        if ( /^>(\S+)\s+(.*)/ ) {
          my $tmpID = $1;
          my $tmpDE = $2;
          if ( $seq ) {
            my @nonNBlocks =
                sort { length( $b ) <=> length( $a ) } split( /[nNxX]+/, $seq );
            if ( length( $nonNBlocks[ 0 ] ) > 30 ) {
              print OUT ">$id $desc\n$seq\n";
            }
            else {
              print OUT ">$id $desc\nNNNNNNNNNN\n";
              $seqsCut++;
            }
          }
          $id   = $tmpID;
          $desc = $tmpDE;
          $seq  = "";
          next;
        }
        s/[\n\r\s]+//g;
        $seq .= $_;
      }
      if ( $seq ) {
        my @nonNBlocks =
            sort { length( $b ) <=> length( $a ) } split( /[nNxX]+/, $seq );
        if ( length( $nonNBlocks[ 0 ] ) > 30 ) {
          print OUT ">$id $desc\n$seq\n";
        }
        else {
          print OUT ">$id $desc\nNNNNNNNNNN\n";
          $seqsCut++;
        }
      }
      close OUT;
      if ( $seqsCut > 0 ) {
        log_print
"       Removed $seqsCut sequences from sample containing mostly Ns ( <30bp non-N blocks )...\n";
      }
    }
  }

  # Now "tmpSample.fa" has been fully masked.  Save it to the more
  # sensible name $sampleFastaFile with a ".masked" extension.
  system( "mv $roundTmpDir/tmpSample.fa $sampleFastaFile.masked" );

  # Tabulate non-Ambiguous bases
  my $sampleDB = FastaDB->new( fileName => "$sampleFastaFile.masked",
                               openMode => SeqDBI::ReadOnly );
  my $postMaskNonAmbigBases = 0;
  foreach my $seqID ( $sampleDB->getIDs() ) {
    $postMaskNonAmbigBases += $sampleDB->getSubtLength( $seqID );
  }
  undef $sampleDB;

  log_print " -- Sample Stats:\n";
  log_print "       Sample Size $actualGenomeSampleSize bp\n";
  log_print "       Num Contigs Represented = "
      . scalar( keys( %sampleContigs ) ) . "\n";
  log_print "       Non ambiguous bp:\n";
  log_print "             Initial: $preMaskNonAmbigBases bp\n";
  log_print "             After Masking: $postMaskNonAmbigBases bp\n";
  if ( $preMaskNonAmbigBases > 0 ) {
    log_print "             Masked: "
        . sprintf(
                   "%0.2f",
                   (
                     100 - (
                        ( $postMaskNonAmbigBases * 100 ) / $preMaskNonAmbigBases
                     )
                   )
        )
        . " % \n";
  }
  else {
    log_print "             Masked: 0\n";
  }

  # MAKEBLASTDB the database
  system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $sampleFastaFile.masked "
          . "-parse_seqids -dbtype nucl -in $sampleFastaFile.masked >> "
          . "$roundTmpDir/makeblastdb.log 2>&1" );
 
  # Gather details of the sample
  # TODO: Get this from the sample routine
  my %sampleSeqs;
  my @sampleSeqIDArray = ();
  my $sampleDBSize     = 0;
  my $sampleDBLen      = 0;
  my $sampleIndex = `$NCBIDBCMD_PRGM -db $sampleFastaFile.masked -entry all -outfmt "gi|%g %l"`;

  # Both search engines produce indexes in reverse order ie:
  #    gi|3  35000
  #    gi|2  40000
  #    gi|1  40000
  while ( $sampleIndex =~ /(\S+)\s+(\d+)\s*$/mg ) {
    $sampleSeqs{$1} = $2;
    push @sampleSeqIDArray, $1;
    $sampleDBLen += $2;
    $sampleDBSize++;
  }
  undef $sampleIndex;
  $totSampledBP += $sampleDBLen;
  log_print " -- Input Database Coverage: $totSampledBP bp out of $dbSize bp ( "
      . sprintf( "%0.2f", ( $totSampledBP / $dbSize ) * 100 )
      . " % )\n";
  log_print "Sampling Time: " . elapsedTime( 2 ) . "\n";

  #
  # Run all-by-other combinations
  #
  $searchEngineN->setMatrix(
                "/opt/gensoft/exe/RepeatModeler/2.0.5/share/Matrices/ncbi/" . "nt/comparison.matrix" );
  $searchEngineN->setGenerateAlignments( 0 );
  $searchEngineN->setTempDir( "$roundTmpDir" );
  $searchEngineN->setMinScore( 250 );
  $searchEngineN->setGapInit( -25 );
  $searchEngineN->setInsGapExt( -5 );
  $searchEngineN->setDelGapExt( -5 );
  $searchEngineN->setMinMatch( 7 );
  $searchEngineN->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
  $searchEngineN->setCores( $threads );

  my $numHSPs = 0;    # Number of High Scoring Pairs produced
  # First check to see if this was already run
  unless ( -f "$roundTmpDir/msps.out" ) {
    # Nope....starting
    log_print "Running all-by-other comparisons...\n";
    my $totalComparisons =
        ( ( $sampleDBSize * $sampleDBSize ) - $sampleDBSize ) / 2;
    my $completedComparisons = 0;
    log_print "  - Total Comparisons = $totalComparisons\n";

    my $startGID  = 1;    # The gi ID of the start of the batch range
    my $endGID    = 0;    # The gi ID of the end of the batch range
    my $batchSize = 0;    # The size of the batch in bp
    my $batchNum  = 1;    # Batch number we are working on
    my $retryLimit   = 2;     # Attempt to run batch # times before failing it

    my $compStartTime = time;
    while ( $startGID < $sampleDBSize ) {
      if (
         $sampleSeqs{ $sampleSeqIDArray[ $startGID ] } < ( $fragmentSize / 4 ) )
      {
        do {
          $endGID++;
          $batchSize += $sampleSeqs{ "gi|" . $endGID };
            } while (    $endGID < $sampleDBSize
                      && $batchSize < $fragmentSize );
      }
      else {
        $endGID = $startGID;
      }

      my $status = undef;
      my $retry = 0;
      my $batchFile = "$roundTmpDir/batch-$batchNum";
      do {
        log_print "WARNING: Retrying batch ( $batchNum ) [ $status ]...\n"
          if ( $retry > 0 );
        $status = runAllvsOtherMatrixRow( $searchEngineN, $batchNum, $startGID,
                                          $endGID, $roundTmpDir, 
                                          $sampleFastaFile, $NCBIDBALIAS_PRGM,
                                          $sampleDBSize );
      }while ( ($status != 0 || ! -e "$batchFile.msps") && 
               $retry++ < $retryLimit );

      if ( $retry >= $retryLimit ) {
        ## Too many retries.
        ## We are out of here!
        log_print "\n\nFATAL ERROR: RepeatModeler giving up. One or more\n"
                  . "batches failed!  Unfortunately this type of error\n"
                  . "cannot be recovered from. Please submit the following\n"
                  . "details to the feedback page at the repeatmasker\n"
                  . "website:\n\n"
                  . "       http://www.repeatmasker.org\n\n"
                  . "RepeatModeler Version: $version\n"
                  . "Search Engine: $engine [ "
                  . $searchEngineN->getVersion() . " ]\n"
                  . "Command Line: $cmdLine\n"
                  . "Batch Number: $batchNum\n"
                  . "Disk Space:\n"
                  . `df $tmpDir` . "\n";
        if ( -e "/proc/meminfo" ) {
          log_print "System Memory:\n";
          open IN, "</proc/meminfo";
          while ( <IN> ) {
            log_print if ( /Mem|Cache|Swap/ );
          }
          close IN;
        }
        log_print "Further details about this problem may be found in\n"
                . "the directory: $tmpDir\n";
        log_print "\n\n";
        exit( -1 );
      }
 
      if ( $status == 0
           && -e "$batchFile.msps" )
      {
        # Run successful
        print "Batch completed: batch=$batchNum\n"
                if ( $DEBUG );
        ## Append msps
        open MSP, "<$batchFile.msps"
          or die "Could not open $batchFile.msps for reading!\n";
        while ( <MSP> ) {
          $numHSPs++;
        }
        close MSP;
        system( "cat $batchFile.msps >> $roundTmpDir/msps.out" );
        unlink( "$batchFile.msps" ) unless ( $DEBUG );
      }

      my $numComparisons = 0;
      for ( my $l = $startGID ; $l <= $endGID ; $l++ ) {
        $numComparisons += $sampleDBSize - $l;
      }
      $completedComparisons += $numComparisons;
      my $comparisonsLeft   = $totalComparisons - $completedComparisons;
      my $estimatedTimeLeft = "--";
      if ( ( time - $compStartTime ) > 0 && $completedComparisons > 0 ) {
        $estimatedTimeLeft =
              int( $comparisonsLeft /
                   ( $completedComparisons / ( time - $compStartTime ) ) );
        my $Min = sprintf( "%02s", int( $estimatedTimeLeft / 60 ) );
        $estimatedTimeLeft -= $Min * 60;
        my $Hours = sprintf( "%02s", int( $Min / 60 ) );
        $Min -= $Hours * 60;
        my $Sec = sprintf( "%02s", $estimatedTimeLeft );
        $estimatedTimeLeft = "$Hours:$Min:$Sec";
      }

      print "      "
            . sprintf("%3s",
                      int(( $completedComparisons / $totalComparisons ) * 100)
                      )
            . "% completed, "
            . " $estimatedTimeLeft (hh:mm:ss) est. time remaining.\n";

      $startGID  = $endGID + 1;
      $batchSize = 0;
      $batchNum++;
    }

    log_print "Comparison Time: "
        . elapsedTime( 2 )
        . ", $numHSPs HSPs Collected\n";
  }

  ## Bugfix reported by Nathaniel Street.
  my $roundModels = 0;
  if ( $numHSPs > $recommendedFamilySize ) {

    # Create the seqnames file
    unless ( -f "$roundTmpDir/seqnames" ) {
      open OUT, ">$roundTmpDir/seqnames";
      print OUT "" . scalar( @sampleSeqIDArray ) . "\n";
      foreach my $key ( sort( @sampleSeqIDArray ) ) {
        print OUT "$key\n";
      }
      close OUT;
    }

    # Run recon
    unless ( -f "$roundTmpDir/summary/eles" ) {
      runRECON( workDir   => $roundTmpDir,
                reconPath => $RECON_DIR );
    }

    #
    #  Build consensi
    #
    if ( -f "$roundTmpDir/summary/eles" ) {
      ( $roundModels ) = &buildReconConsensiMT(
                                          workDir          => $roundTmpDir,
                                          consensiFile     => $consensiFile,
                                          combFamiliesFile => $combFamiliesFile,
                                          round            => $round,
                                          genomeDB         => $genomeDB,
                                          familyCutoff     => $familyCutoff,
                                          threads          => $threads
      );
 
      # We don't need to keep the large families datastructure around
      # at this point
      undef $families;
      $numModels += $roundModels;
    }
    else {
      log_print "\n  - Recon did not produce any element definitions.\n\n";
    }
  }

  # Remove all the old batch files
  unless ( $DEBUG ) {
    opendir( DIR, $roundTmpDir )
        or die $CLASS
        . ": Can't open director $roundTmpDir "
        . "for reading!\n";
    my $file;
    while ( defined( $file = readdir( DIR ) ) ) {
      if (    $file =~ /batch-(\d+)(.fa|.gilist|.gilist.txt)/
           || $file =~ /family-(\d+)(-cons.html|-cons-\d+.fa)/ )
      {

        # family-68-cons.html family-68-cons.malign
        # batch-11.fa  family-6-cons-3.fa
        unlink( "$roundTmpDir/$file" );
      }
    }
    closedir( DIR );
  }

  log_print "Round Time: " . elapsedTime( 1 ) . " : $roundModels families discovered.\n";

# How do we proceed from here
#
#  Case 1 (default): Sample without replacement.  Sample up to $genomeSampleSizeMax (100MB).  Typically
#                    this isn't anywhere close to the genome size, but if it is, quit when we get
#                    reach that.  If the last round is reached and there is only a little bit left
#                    over, add that as well.
#  Case 2 (onlyIns): Only genome inserts.  Sample (with replacement) up to $genomeSampleSizeMax.  No
#                    need to quit early.
#
#  $genomeSampleSizeMax          : The target sample size for the last round
#  $sampleSize                   : The current size of the sample requested for this round
#  $sampleDBLen                  : The true length of the sampleDB-#.fa file
#  $dbSize                       : The size of the input database
#  $genomeSampleSizeMultiplier   : The multiplier for sample size between rounds
#  $totSampledBP                 : The sum of the sampleDB-#.fa files.  The total bases searched thus far
#  $fragmentSize                 : The size of the batches for all-vs-other comparisons
#
# Calculate the new sampleSize for the next round
# RECON Rounds, check for completion.
  if ( $options{'onlyIns'} ) {
    if (    $sampleSize >= $genomeSampleSizeMax
         || $sampleSize >= $dbSize )
    {
      last;
    }
  }
  else {
    if (    $totSampledBP >= $dbSize
         || ( $sampleSize >= $genomeSampleSizeMax && $numAdditionalRounds <= 0 )
         || $seqRemaining < $fragmentSize )
    {
      last;
    }
  }
  if ( $numAdditionalRounds > 0 && $sampleSize >= $genomeSampleSizeMax )
  {
    $numAdditionalRounds--;
  }

  # This is a followup round of RECON, scale sampleSize accordingly
  $sampleSize *= $genomeSampleSizeMultiplier;

  # Are we sampling without replacement? If so, make sure
  # we don't run out of sequence.
  if ( !$options{'onlyIns'} ) {
    if ( $seqRemaining > 0 && $seqRemaining <= ( $sampleSize / 2 ) ) {
      log_print "   - Increasing sample size to include end piece now = ";
      $sampleSize += $seqRemaining;
      log_print "$sampleSize\n";
    }
  }

  # Do not go overboard
  $sampleSize = $genomeSampleSizeMax
      if ( $sampleSize > $genomeSampleSizeMax );

  $round++;
}    # Main Loop while(...
log_print "\nRepeatScout/RECON discovery complete: $numModels families found\n";

## TODO: Use the last round's masking data to re-estimate the
##       abundance of families found in previous rounds.

# Optionally perform LTR structural search
if ( $options{'LTRStruct'} ) {
  elapsedTime( 1 );
  log_print "\n\nLTR Structural Analysis\n";
  log_print "=======================\n";

  # Save genome to sequence file
  open OUT, ">$tmpDir/tmpInputSeq"
      or die "Could not open $tmpDir/tmpInputSeq for writing!\n";
  open SEQ, "$NCBIDBCMD_PRGM -db $genomeDB -entry all -outfmt \"%f\"|"
      or die "Could not run: $NCBIDBCMD_PRGM\n";
  while ( <SEQ> ) {
    if ( /^>/ ) {

      # Replace >gi|232 with >gi-232 so that LTR_retriever is happy.  This is
      # a known issue with LTR_retriever-2.6
      s/\|/-/;
    }
    print OUT;
  }
  close SEQ;
  close OUT;
  if ( -s "$tmpDir/tmpInputSeq" ) {
    my $optionalArgs = " -giToID $genomeDB.translation";
    $optionalArgs .= " -LTRMaxSeqLen $options{'LTRMaxSeqLen'}"
        if ( exists $options{'LTRMaxSeqLen'} );
    $optionalArgs .= " -threads $options{'threads'}" if ( exists $options{'threads'} );
    $optionalArgs .= " -debug"             if ( $DEBUG );
    ## Run the LTR Pipeline
    print "Calling LTRPipeline as: LTRPipeline $optionalArgs $tmpDir/tmpInputSeq\n" if ( $DEBUG );
    system( "LTRPipeline $optionalArgs $tmpDir/tmpInputSeq" );
    unlink "$tmpDir/tmpInputSeq" unless ( $DEBUG );
    if ( -s "$tmpDir/tmpInputSeq-ltrs.fa" ) {

      # Combine results between both pipelines
      # run CDhit
      # /usr/local/cd-hit/cd-hit-est -aS 0.8 -c 0.8 -g 1 -G 0 -A 80 -M 10000
      #         -i all_consensi.fasta -o all_consensi.clusters1.fasta -T 20
      #   T is threads, 0 = unlimited
      #
      # >Cluster 0
      # 0	8283nt, >rnd-3_family-2#LINE... *
      # 1	3592nt, >rnd-4_family-1#LINE... at 1:3592:4587:8184/+/98.72%
      # 2	2990nt, >rnd-5_family-1#LINE... at 1:2990:4994:7985/+/97.46%
      # 3	1641nt, >rnd-5_family-22#LIN... at 2:1623:10:1632/+/87.13%
      #
      # Jullien clustered and removed RECON/RepeatScout hits that overlapped
      # Then she clustered a second time and collapsed clusters using Refiner.
      #
      # What we are doing:
      #   Cluster the sequences with RepeatModeler's RECON/RepeatScout pipeline.
      #   Identify non-singletons and collapse in favor of the longest sequence
      #   ( for RECON/RepeatScout clusters ) or in favor of LTRPipeline in mixed
      #   clusters.  Keep all LtrPipeline produced results.
      #
      log_print "  -- Clustering results with previous rounds...\n";

      # 1. Combine results from both pipelines into a file
      system(
"cat $tmpDir/tmpInputSeq-ltrs.fa $tmpDir/consensi.fa > $tmpDir/combined.fa" );
      system(
"cat $tmpDir/tmpInputSeq-ltrs.stk $tmpDir/families.stk > $tmpDir/combined.stk"
      );

      # 2. Cluster results
      my $cmd =
            "$CDHIT_DIR/cd-hit-est -aS 0.8 -c 0.8 -g 1 -G 0 -A 80 -M 10000 "
          . "-i $tmpDir/combined.fa -o $tmpDir/cd-hit-out -T $threads "
          . "> $tmpDir/cd-hit-stdout 2>&1";
      system( $cmd);
      unlink( "$tmpDir/cd-hit-stdout" ) if ( -e "$tmpDir/cd-hit-stdout" );
      unlink( "$tmpDir/cd-hit-out" )    if ( -e "$tmpDir/cd-hit-out" );

      # 3. Process clusters and remove redundancy
      my %redundant_families;
      my %putative_subfamilies;
      my $ltrFamCnt = 0;
      my $rrFamCnt  = 0;
      if ( -s "$tmpDir/cd-hit-out.clstr" ) {
        open IN, "<$tmpDir/cd-hit-out.clstr"
            or die
"RepeatModeler: Could not open $tmpDir/cd-hit-out.clstr for reading!\n";
        my @cluster = ();
        my $longest_ltr_id;
        my $longest_ltr_size;
        my $longest_rnd_id;
        my $longest_rnd_size;
        while ( <IN> ) {
          if ( /^>Cluster/ ) {
            if ( @cluster > 1 ) {

              # I started out by picking the longest LTRPipeline
              # derived family as the cluster rep.  Now we
              # consider all LTRPipeline candidates as dominant.
              # The only reason two more more LTRPipeline candidates
              # would show up in a cluster would be due to
              # overlap between LTR/INT sequences that didn't get
              # removed by LTR_retriever.
              #
              # Keeping the longest_ltr_id variable as an indicator
              # that there is at least one LTRPipeline candidate in the
              # cluster.
              if ( $longest_ltr_id ) {
                foreach my $id ( @cluster ) {

                  # Now...remove only overlapping RECON/RepeatScout
                  # candidates ( rnd-#_family-# ).
                  if ( $id !~ /^ltr-\d+_family-\d+/ ) {

            # Use the longest as the "reason" why we are removing it.
            #print "Removing $id because it's redundant with $longest_ltr_id\n";
                    $redundant_families{$id}++;
                  }
                }
              }
              elsif ( $longest_rnd_id ) {
                foreach my $id ( @cluster ) {
                  if ( $id ne $longest_rnd_id ) {

               #print "Labeling $id as putative subfamily of $longest_rnd_id\n";
                    $putative_subfamilies{$id} = $longest_rnd_id;
                  }
                }
              }
            }
            $longest_ltr_id   = undef;
            $longest_ltr_size = 0;
            $longest_rnd_id   = undef;
            $longest_rnd_size = 0;
            @cluster          = ();
            next;
          }
          if ( /^\d+\s+(\d+)nt,\s*>((rnd|ltr)-\d+_family-\d+)/ ) {
            my $size = $1;
            my $id   = $2;
            my $type = $3;
            if ( $type eq "ltr" ) {
              $ltrFamCnt++;
              if ( $size > $longest_ltr_size ) {
                $longest_ltr_id   = $id;
                $longest_ltr_size = $size;
              }
            }
            if ( $type eq "rnd" ) {
              $rrFamCnt++;
              if ( $size > $longest_rnd_size ) {
                $longest_rnd_id   = $id;
                $longest_rnd_size = $size;
              }
            }
            push @cluster, $id;
          }
        }
        close IN;

        # TODO: Do we really want to keep this?
        #unlink("$tmpDir/cd-hit-out.clstr" );
      }
      log_print "       - $rrFamCnt RepeatScout/RECON families\n";
      log_print "       - $ltrFamCnt LTRPipeline families\n";
      if ( keys( %redundant_families ) ) {
        system(
               "mv $tmpDir/consensi.fa $tmpDir/consensi.fa.recon_rscout_only" );
        system( "mv $tmpDir/combined.fa $tmpDir/consensi.fa.with_redundancy" );

        # Filter consensi.fa and families.stk
        open IN, "<$tmpDir/consensi.fa.with_redundancy"
            or die
"RepeatModeler: Could not open $tmpDir/consensi.fa.with_redundancy for reading";
        open OUT, ">$tmpDir/consensi.fa"
            or die
            "RepeatModeler: Could not open $tmpDir/consensi.fa for writing";
        my $id;
        my $data;
        while ( <IN> ) {
          if ( /^>(\S+)/ ) {
            my $tmpID = $1;
            if ( $data ) {
              if ( !exists $redundant_families{$id} ) {
                print OUT $data;
              }
            }
            if ( exists $putative_subfamilies{$tmpID} ) {
              my $tstr = $_;
              $tstr =~ s/[\n\r]//g;
              $data =
                  "$tstr [ putative subfamily of "
                  . $putative_subfamilies{$tmpID} . " ]\n";
            }
            else {
              $data = $_;
            }
            $id = $tmpID;
            next;
          }
          $data .= $_;
        }
        if ( $data ) {
          if ( !exists $redundant_families{$id} ) {
            print OUT $data;
          }
        }
        close IN;
        close OUT;
        system(
             "mv $tmpDir/families.stk $tmpDir/families.stk.recon_rscout_only" );
        system(
               "mv $tmpDir/combined.stk $tmpDir/families.stk.with_redundancy" );
        open IN, "<$tmpDir/families.stk.with_redundancy"
            or die
"RepeatModeler: Could not open $tmpDir/families.stk.with_redundancy for reading";
        open OUT, ">$tmpDir/families.stk"
            or die
            "RepeatModeler: Could not open $tmpDir/families.stk for writing";
        $id   = "";
        $data = "";

        while ( <IN> ) {
          if ( /^#=GF\s+ID\s+(\S+)/ ) {
            $id = $1;
          }

          if ( /^#=GF\s+DE\s+(\S.*)/ ) {
            if ( exists $putative_subfamilies{$id} ) {
              my $tstr = $_;
              $tstr =~ s/[\n\r]//g;
              $data .=
                  "$tstr [ putative subfamily of "
                  . $putative_subfamilies{$id} . " ]\n";
            }
            else {
              $data .= $_;
            }
          }
          else {
            $data .= $_;
          }

          if ( /^\/\// ) {
            if ( !exists $redundant_families{$id} ) {
              print OUT $data;
            }
            $data = "";
          }
        }
        close IN;
        close OUT;
        log_print "       - Removed "
            . scalar( keys( %redundant_families ) )
            . " redundant LTR families.\n";
        log_print "       - Final family count = "
            . (
            ( $rrFamCnt + $ltrFamCnt ) - scalar( keys( %redundant_families ) ) )
            . "\n";
      }
    }
  }
  else {
    log_print
"\nWARNING: Could not create input file for LTRPipeline from $genomeDB! Continuing using\nresults from RepeatScout/RECON pipeline only.\n";
  }
  log_print "LTRPipeline Time: " . elapsedTime( 1 ) . "\n";
}

elapsedTime( 1 );
if ( $numModels > 0 ) {
  log_print "\n\n";
  my $origDir = getcwd();
  chdir( $tmpDir );
  my $addlOpts = "";
  $addlOpts = "-threads $options{'threads'} " if ( $options{'threads'} );
  system(   "RepeatClassifier "
          . "-consensi consensi.fa -stockholm families.stk" );
  chdir( $origDir );
  log_print "Classification Time: " . elapsedTime( 1 ) . "\n";
}

log_print "\n\nProgram Time: " . elapsedTime( "runtime" ) . "\n";
print "Working directory:  $tmpDir\n";
print "may be deleted unless there were problems with the run.\n";

if ( $numModels > 0 ) {
  system( "cp $tmpDir/consensi.fa.classified $genomeDB-families.fa" )
      if ( -s "$tmpDir/consensi.fa.classified" );
  system( "cp $tmpDir/families-classified.stk $genomeDB-families.stk" )
      if ( -s "$tmpDir/families-classified.stk" );
  system( "cp $tmpDir/rmod.log $genomeDB-rmod.log" )
      if ( -s "$tmpDir/rmod.log" );
  print "\nThe results have been saved to:\n";
  print
"  $genomeDB-families.fa  - Consensus sequences for each family identified.\n";
  print
"  $genomeDB-families.stk - Seed alignments for each family identified.\n";
  print
"  $genomeDB-rmod.log     - Execution log.  Useful for reproducing results.\n\n";
  print "The RepeatModeler stockholm file is formatted so that it can\n";
  print
"easily be submitted to the Dfam database.  Please consider contributing\n";
  print
      "curated families to this open database and be a part of this growing\n";
  print
      "community resource.  For more information contact help\@dfam.org.\n\n\n";
}
else {
  print "No families identified.  Perhaps the database is too small\n";
  print "or contains overly fragmented sequences.\n";
}
exit;






#############################################################################
#                          S U B R O U T I N E S                            #
#############################################################################

sub runAllvsOtherMatrixRow {
  my $searchEngineN = shift;
  my $batchNum = shift;
  my $startGID = shift;
  my $endGID   = shift;
  my $roundTmpDir = shift;
  my $sampleFastaFile = shift;
  my $NCBIDBALIAS_PRGM  = shift;
  my $sampleDBSize = shift;

  # Generate query batch file
  my $sampleID = $startGID;
  while ( $sampleID <= $endGID ) {
    system(   "$NCBIDBCMD_PRGM -db $sampleFastaFile.masked "
            . "-entry \"gi|$sampleID\" "
            . ">> $roundTmpDir/batch-$batchNum.fa "
            . "2>>$roundTmpDir/blastdbcmd.log" );
    $sampleID++;
  }

  $searchEngineN->setQuery( "$roundTmpDir/batch-$batchNum.fa" );
  $searchEngineN->setSubject( "$sampleFastaFile.masked" );

  # Create a gilist
  # The batches are sometimes made up of smaller sequences
  # grouped together to form the equivalent of a standard
  # sized fragment ( $fragmentSize ).  Note that this breaks
  # the standard all-vs-other approach of only comparing to
  # sequences with a GID greater than the sequence itself.
  # Instead for those multi-sequence batches some amount of
  # identity and reciprocal matches:
  #   e.g: s2/s2, s3/s3, s2/s3, s3/s2  among others
  #
  #         b1..........b2..........b3....
  #         s1 s2 s3 s4 s5 s6 s7 s8 s9 s10
  # b1  s1     |----------search1--------|
  # .   s2           
  # .   s3          
  # .   s4                     
  # b2  s5                 |---search2---|  
  # ......................................
  # 
  # These will get filtered out later.  That is why the giList starts
  # after the starting GID regardless of being a 1 sequence
  # batch or a multi-sequence batch.
  #
  my @giList = ( ( $startGID + 1 ) .. $sampleDBSize );
  open GI, ">$roundTmpDir/batch-$batchNum-gilist.txt"
    or die "Could not open up file $roundTmpDir/batch-$batchNum-gilist.txt "
           . "for writing!\n";
  print GI join( "\n", @giList ) . "\n";
  close GI;
  system(   "$NCBIDBALIAS_PRGM -gi_file_in "
            . "$roundTmpDir/batch-$batchNum-gilist.txt "
            . "-gi_file_out $roundTmpDir/batch-$batchNum-gilist "
            . "> /dev/null 2>&1" );
  $searchEngineN->setAdditionalParameters(
        " -gilist $roundTmpDir/batch-$batchNum-gilist " );

  # Run search
  my ( $status, $resultCollection ) = $searchEngineN->search();
  open OUT, ">$roundTmpDir/batch-$batchNum.msps"
     or die "Could not open $roundTmpDir/batch-$batchNum.msps for writing!\n";

  print "Search Parameters: " . $searchEngineN->getParameters() . "\n"
     if ( $DEBUG );
  if ( $status ) {
    print STDERR "\nERROR from search engine (", $? >> 8, ") \n";
  }
  else {
    print "Filtering ( " . $resultCollection->size() . " ) Results...\n"
      if ( $DEBUG );

    for ( my $k = 0 ; $k < $resultCollection->size() ; $k++ ) {
      my $resultRef = $resultCollection->get( $k );
      my $qryId     = $resultRef->getQueryName();
      $qryId = $1 if ( $qryId =~ /^gi\|(\d+).*/ );
      my $sbjId = $resultRef->getSubjName();
      $sbjId = $1 if ( $sbjId =~ /^gi\|(\d+).*/ );
      # This is necessary to screen out A/B,B/A,A/A,B/B matches
      # generated by multi-sequence batches (see above).
      next if ( $sbjId <= $qryId );
      my $mspLine = "";
      $mspLine = sprintf( "%06d %04.1f ",
                          $resultRef->getScore(),
                          ( 100 - $resultRef->getPctDiverge() ) );

      if ( $resultRef->getOrientation ne "C" ) {
        $mspLine .= sprintf( "%06d %06d ",
                             $resultRef->getQueryStart(),
                             $resultRef->getQueryEnd() );
      }
      else {
        $mspLine .= sprintf( "%06d %06d ",
                             $resultRef->getQueryEnd(),
                             $resultRef->getQueryStart() );
      }
      $mspLine .= sprintf( "%s %06d %06d %s \n",
                   "gi|$qryId",
                           $resultRef->getSubjStart(),
                           $resultRef->getSubjEnd(), "gi|$sbjId" );
      print OUT $mspLine;
    }
  }
  close OUT;

  return ($status);
}


sub benchmarkStorage {
  my $file = shift;

  my $numBlocks = 1000;
  my $blockSize = 64 * 1024;

  my $data;
  my @alpha = ( 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '-', '_' );
  for ( my $i = 0 ; $i < $blockSize ; $i++ ) {
    $data .= $alpha[ rand( @alpha ) ];
  }

  my $elapsed;
  my $numBlocksWritten = 0;
  my $start            = [ gettimeofday ];

  # Perform the experiment
  open OUT, ">$file" or die "Could not open $file for writing!\n";
  for ( my $i = 0 ; $i < $numBlocks ; $i++ ) {
    print OUT "$data";
  }
  close OUT;
  unlink $file;

  $elapsed = tv_interval( $start );
  my $bytesPerSec = ( ( $numBlocks * $blockSize ) / $elapsed );

  return ( $bytesPerSec / 1000000 );
}

sub runRECON {
  my %parameters = @_;

  elapsedTime( 2 );
  die $CLASS . "::runRecon() missing reconPath parameter!\n"
      if ( !defined $parameters{'reconPath'} );
  my $RECON_DIR = $parameters{'reconPath'};

  die $CLASS . "::runRecon() missing workDir parameter!\n"
      if ( !defined $parameters{'workDir'} );

  my $tmpDir  = $parameters{'workDir'};
  my $origDir = getcwd();
  chdir( $tmpDir );

  # If these two do not exist imagespread will fail
  # TODO: Make sure they aren't already here.
  system( "mkdir images" );
  system( "mkdir summary" );

  print "  - RECON: Running imagespread..\n";
  `$RECON_DIR/imagespread seqnames msps.out`;
  `mv gmon.out imagespread-gmon.out` if ( -f "gmon.out" );

  if ( $? ) { die "imagespread failed. Exit code $?\n"; }
  print "RECON Elapsed: " . elapsedTime( 2 ) . "\n";

  my $sect = 1;

  for ( my $i = 1 ; $i <= $sect ; $i++ ) {
    my $spread = "images/spread" . $i;
    `sort -T . -k 3,3 -k 4n,4n -k 5nr,5nr $spread >> images/images_sorted`;
    if ( $? ) { die "sort failed for $spread.\n"; }
  }

  `rm -f images/spread*`;

  # initial definition of elements
  `rm -rf ele_def_res`;
  `mkdir ele_def_res`;

  print "  - RECON: Running initial definition of elements ( eledef )..\n";
  `$RECON_DIR/eledef seqnames msps.out single`;
  `mv gmon.out eledef-gmon.out` if ( -f "gmon.out" );
  if ( $? ) { die "eledef failed. Exit code $?\n"; }
  print "RECON Elapsed: " . elapsedTime( 2 ) . "\n";

  # re-defining elements
  `rm -rf ele_redef_res`;
  `mkdir ele_redef_res`;

  `rm -f tmp tmp2`;
  `ln -s ele_def_res tmp`;
  `ln -s ele_redef_res tmp2`;

  print "  - RECON: Running re-definition of elements ( eleredef )..\n";
  `$RECON_DIR/eleredef seqnames`;
  `mv gmon.out eleredef-gmon.out` if ( -f "gmon.out" );
  if ( $? ) { die "eleredef failed. Exit code $?\n"; }
  print "RECON Elapsed: " . elapsedTime( 2 ) . "\n";

  `rm -f tmp tmp2`;

  # re-defining edges
  `rm -rf edge_redef_res`;
  `mkdir edge_redef_res`;

  `rm -f tmp tmp2`;
  `ln -s ele_redef_res tmp`;
  `ln -s edge_redef_res tmp2`;

  print "  - RECON: Running re-definition of edges ( edgeredef )..\n";
  `$RECON_DIR/edgeredef seqnames`;
  `mv gmon.out edgeredef-gmon.out` if ( -f "gmon.out" );
  if ( $? ) { die "edgeredef failed. Exit code $?\n"; }
  print "RECON Elapsed: " . elapsedTime( 2 ) . "\n";

  # famdef
  `rm -f tmp tmp2`;
  `ln -s edge_redef_res tmp`;

  print "  - RECON: Running family definition ( famdef )..\n";
  `$RECON_DIR/famdef seqnames`;
  `mv gmon.out famdef-gmon.out` if ( -f "gmon.out" );
  if ( $? ) { die "famdef failed. Exit code $?\n"; }
  print "RECON Elapsed: " . elapsedTime( 2 ) . "\n";

  `rm -f tmp`;

  unless ( $DEBUG ) {
    `rm -rf ele_def_res`;
    `rm -rf ele_redef_res`;
    `rm -rf edge_redef_res`;
  }

  chdir( $origDir );

  return;
}

##-------------------------------------------------------------------------##
#
# Use: my = runRepeatScout( .. );
#
#
##-------------------------------------------------------------------------##
sub runRepeatScout {
  my %parameters = @_;
  die $CLASS . "::runRepeatScout() missing rscoutPath parameter!\n"
      if ( !defined $parameters{'rscoutPath'} );
  my $RSCOUT_DIR = $parameters{'rscoutPath'};

  die $CLASS . "::runRepeatScout() missing trfPrgm parameter!\n"
      if ( !defined $parameters{'trfPrgm'} );
  my $TRF_PRGM = $parameters{'trfPrgm'};

  die $CLASS . "::runRepeatScout() missing dustmskPrgm parameter!\n"
      if ( !defined $parameters{'dustmskPrgm'} );
  my $DUSTMASKER_PRGM = $parameters{'dustmskPrgm'};

  die $CLASS . "::runRepeatScout() missing workDir parameter!\n"
      if ( !defined $parameters{'workDir'} );
  my $tmpDir = $parameters{'workDir'};

  die $CLASS . "::runRepeatScout() missing fastaFile parameter!\n"
      if ( !defined $parameters{'fastaFile'} );
  my $fastaFile = $parameters{'fastaFile'};

  die $CLASS . "::runRepeatScout() missing seqSize parameter!\n"
      if ( !defined $parameters{'seqSize'} );
  my $seqSize = $parameters{'seqSize'};

  my $origDir = getcwd();
  chdir( $tmpDir );

  # Calculate lmer size
  my $lmerSize = ceil( ( log( $seqSize ) / log( 4 ) ) + 1 );

  print "   - RepeatScout: Running build_lmer_table ( l = $lmerSize )..\n";
  my $cmd =
        "$RSCOUT_DIR/build_lmer_table -l $lmerSize -sequence "
      . "$fastaFile -freq $fastaFile.lfreq 2>&1 > repeatscout.log";
  `$cmd`;
  if ( $? ) { die "build_lmer_table failed. Exit code $?\n"; }

  print "   - RepeatScout: Running RepeatScout..";
  $cmd =
        "$RSCOUT_DIR/RepeatScout -l $lmerSize -sequence $fastaFile"
      . " -tandemdist 500 -output $fastaFile.rscons -freq $fastaFile.lfreq "
      . "-stopafter 100 2>&1 >> repeatscout.log";

  `$cmd`;
  if ( $? || -s "repeatscout.log" ) {
    die
"RepeatScout failed. Exit code $?.\n  Check $tmpDir/repeatscout.log for more details.\n";
  }

  if ( -s "$fastaFile.rscons" ) {
    my $rawFams = 0;
    open IN, "<$fastaFile.rscons"
        or die "Could not open $tmpDir/$fastaFile.rscon for reading!\n";
    while ( <IN> ) {
      $rawFams++ if ( /^>/ );
    }
    close IN;
    print " : $rawFams raw families identified\n";

    # Run TRF/DUSTMASKER on sequences
    print "   - RepeatScout: Running filtering stage..";
    $ENV{'TRF_COMMAND'}        = $TRF_PRGM;
    $ENV{'DUSTMASKER_COMMAND'} = $DUSTMASKER_PRGM;
    $cmd                       =
          "$RSCOUT_DIR/filter-stage-1.prl $fastaFile.rscons > "
        . "$fastaFile.rscons.filtered 2>> filter-stage-1.log";
    `$cmd`;

    my $finalFams = 0;
    if ( -s "$fastaFile.rscons.filtered" ) {
      open IN, "<$fastaFile.rscons.filtered"
          or die
          "Could not open $tmpDir/$fastaFile.rscon.filtered for reading!\n";
      while ( <IN> ) {
        $finalFams++ if ( /^>/ );
      }
      close IN;
    }
    else {
      if ( -s "filter-stage-1.log" ) {
        open IN, "<filter-stage-1.log"
            or die "Could not open $tmpDir/filter-stage-1.log for reading!\n";
        my $errMsg = "";
        while ( <IN> ) {
          if ( /ERROR/ || /Can\'t locate/ || /BEGIN failed/ ) {
            $errMsg .= $_;
          }
        }
        close IN;
        if ( $errMsg ne "" ) {
          die "RepeatScout filter-stage-1 failed:\n$errMsg"
              . "Please see $tmpDir/filter-stage-1.log for details.\n";
        }
      }
    }
    print " $finalFams families remaining\n";
  }
  else {
    print " : no families identified\n";
  }

  chdir( $origDir );

  return;
}


##-------------------------------------------------------------------------##
#
# Use: my = buildRSConsensiMT( .. );
#
#   RepeatScout produces consensus sequences but does not indicate which
#   regions from the genome contributed to the overall multiple alignment.
#   To compensate for this we search the genome with the consensus and
#   obtain the complete set of matching regions.
#
##-------------------------------------------------------------------------##
sub buildRSConsensiMT {
  my %parameters = @_;

  die "buildRSConsensiMT(): Missing workDir parameter!\n"
      if ( !defined $parameters{'workDir'} );
  my $workDir = $parameters{'workDir'};

  die "buildRSConsensiMT(): Missing round parameter!\n"
      if ( !defined $parameters{'round'} );
  my $round = $parameters{'round'};

  die "buildRSConsensiMT(): Missing consensiFile parameter!\n"
      if ( !defined $parameters{'consensiFile'} );
  my $consensiFile = $parameters{'consensiFile'};

  die "buildRSConsensiMT(): Missing combFamiliesFile parameter!\n"
      if ( !defined $parameters{'combFamiliesFile'} );
  my $combFamiliesFile = $parameters{'combFamiliesFile'};

  die "buildRSConsensiMT(): Missing familyCutoff parameter!\n"
      if ( !defined $parameters{'familyCutoff'} );
  my $familySizeCutoff = $parameters{'familyCutoff'};

  die "buildRSConsensiMT(): Missing numContigs parameter!\n"
      if ( !defined $parameters{'numContigs'} );
  my $dbNumContigs = $parameters{'numContigs'};

  die "buildRSConsensiMT(): Missing usedBlockIndex parameter!\n"
      if ( !defined $parameters{'usedBlockIndex'} );
  my $usedBlockIndex = $parameters{'usedBlockIndex'};

  die "buildRSConsensiMT(): Missing threads parameter!\n"
      if ( !defined $parameters{'threads'} );
  my $threads = $parameters{'threads'};

  my %families            = ();
  my $numModels           = 0;
  my @indices             = ();
  my %localizedToSeqNames = ();

  ## Get Sequences
  my $searchEngineN = NCBIBlastSearchEngine->new( pathToEngine => $RMBLASTN_PRGM );
  $searchEngineN->setMatrix(
                       "/opt/gensoft/exe/RepeatModeler/2.0.5/share/Matrices/ncbi/nt/comparison.matrix" );

  # Use seg masking in rmblast?
  $searchEngineN->setTempDir( "$workDir" );
  $searchEngineN->setMinScore( 250 );
  $searchEngineN->setGenerateAlignments( 1 );
  $searchEngineN->setGapInit( -25 );
  $searchEngineN->setInsGapExt( -5 );
  $searchEngineN->setDelGapExt( -5 );
  $searchEngineN->setMinMatch( 7 );
  $searchEngineN->setCores($threads);
  $searchEngineN->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );

  # TODO: Large Satellite Detection
  #    1. Build a database from consensi.fa
  elapsedTime( "satellite_filtering" );
  if ( $engine eq "abblast" ) {
    # XDFORMAT the conseni.fa
    system(   "$XDFORMAT_PRGM -n -I "
                . "$workDir/consensi.fa >> "
                . "$workDir/xdformat.log 2>&1" );
  }
  else {
    # MAKEBLASTDB the consensi.fa
    system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $workDir/consensi.fa "
                . "-parse_seqids -dbtype nucl -in $workDir/consensi.fa >> "
                . "$workDir/makeblastdb.log 2>&1" );
  }
  #    2. Search consensi.fa against itself
  $searchEngineN->setSubject( "$workDir/consensi.fa" );
  $searchEngineN->setQuery( "$workDir/consensi.fa" );
  #    3. For each family which has a off diagonal match to itself
  #       calculate the coverage of the off diagonal hits
  #       Filter out families which are larger than 3kb, 
  #       are covered by >90% by off diagonal hits.
  my %satelliteSeqIDs = ();
  my ( $status, $resultCollection ) = $searchEngineN->search();
  if ( $status ) {
    print STDERR "\nERROR from search engine (", $? >> 8, ") \n";
  }
  else {
    print "   - Large Satellite Filtering.. ";
    my %coverage = ();
    for ( my $k = 0 ; $k < $resultCollection->size() ; $k++ ) {
      my $resultRef = $resultCollection->get( $k );
      # Only consider families against themselves and off diagonal hits
      next if ( $resultRef->getQueryName() ne $resultRef->getSubjName() ||
                ( $resultRef->getQueryStart() == 1 && $resultRef->getQueryRemaining() == 0 &&
                  $resultRef->getSubjStart() == 1 && $resultRef->getSubjRemaining() == 0 ) );
      my $ID = $resultRef->getQueryName();
      my $familyLen = $resultRef->getQueryEnd() + $resultRef->getQueryRemaining();
      # Do not want to screen out computationally easy satellites
      next if ( $familyLen < 3000 );
      $coverage{$ID} += $resultRef->getQueryEnd() - $resultRef->getQueryStart() + 1;
      if ( $coverage{$ID} > 0.9 * $familyLen ) {
        $satelliteSeqIDs{$ID} = 1;
      }
    }
    if ( keys %satelliteSeqIDs ) {
      if ( $DEBUG ) {
        system("cp $workDir/consensi.fa $workDir/consensi.rsorig.fa");
      }
      my $consDB = FastaDB->new( fileName => "$workDir/consensi.fa",
                                 openMode => SeqDBI::ReadOnly );
      open OUTC, ">$workDir/consensi-filtered.fa" or 
         die "Could not open $workDir/consensi-filtered.fa for writing!\n";

      foreach my $seqID ( $consDB->getIDs() ) {
        unless ( exists $satelliteSeqIDs{ $seqID } ) {
          my $seq  = $consDB->getSequence( $seqID );
          my $desc = $consDB->getDescription( $seqID );
          print OUTC ">$seqID\n";
          $seq =~ s/(.{50})/$1\n/g;
          print OUTC "$seq\n";
        }elsif ( $DEBUG ) {
          print "INFO: Filtering out putative satellite $seqID ( coverage = $coverage{$seqID} bp ).\n";
        }
      }
      close OUTC;
      undef $consDB;

      system("mv $workDir/consensi-filtered.fa $workDir/consensi.fa");
    }
  }
  # Cleanup uncessary DB files
  unless ( $DEBUG ) {
    foreach my $suffix ( "nhr", "nin", "nnd", "nni", "nog", "nsq" ) {
      unlink("$workDir/consensi.fa.$suffix") if ( -e "$workDir/consensi.fa.$suffix" );
    }
  }
  print ": " . scalar(keys(%satelliteSeqIDs)) . " found in " . elapsedTime("satellite_filtering") . "\n";

  # Refine remaining consensi
  print "   - Collecting repeat instances...";
  elapsedTime( "cinstances" );

  # Regenerate database for consensi.fa
  system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $workDir/consensi.fa "
                . "-parse_seqids -dbtype nucl -in $workDir/consensi.fa >> "
                . "$workDir/makeblastdb.log 2>&1" );

  # New - masklevel + switched query/subj
  $searchEngineN->setMaskLevel( 80 );
  $searchEngineN->setQuery( "$workDir/sampleDB-$round.fa" );
  $searchEngineN->setSubject( "$workDir/consensi.fa" );

  #print "Running: " . $searchEngineN->getParameters() . "\n";
  my ( $status, $resultCollection ) = $searchEngineN->search();
  if ( $status ) {
    print STDERR "\nERROR from search engine (", $? >> 8, ") \n";
  }
  else {
    print ": " . elapsedTime("cinstances") . "\n";
    elapsedTime( "family_refinement" );
    open OUTC, ">$workDir/consensi-refined.fa";
    open INDX, ">$workDir/index.html";
    #print "Saving results as $workDir/RSalign.out\n";
    $resultCollection->write("$workDir/RSalign.out", SearchResult::NoAlign);

    print "Filtering Results...\n" if ( $DEBUG );
    for ( my $k = 0 ; $k < $resultCollection->size() ; $k++ ) {
      my $resultRef = $resultCollection->get( $k );
      my $orient = $resultRef->getOrientation();

      # Extract the true seq name and keep track of how
      # many sequences this element is found -- used for
      # segmental duplication filtering.
      my $sampleID = $resultRef->getQueryName();
      my $seqID = "gi|" . ( $k + 1 ) . " ";
      if ( defined $usedBlockIndex->{$sampleID} ) {
        my $thisID = $usedBlockIndex->{$sampleID}->{'seqID'};
        $localizedToSeqNames{$thisID}++;
        # 1/31/2022 : RMH - Preserve orientation in family sequence file
        if ( $orient eq "C" || $orient eq "-" ) {
          $seqID .=
            $thisID . ":"
            . ($resultRef->getQueryEnd() + $usedBlockIndex->{$sampleID}->{'start'} - 1)
            . "-"
            . ($resultRef->getQueryStart() + $usedBlockIndex->{$sampleID}->{'start'} - 1);
        }else {
          $seqID .=
            $thisID . ":"
            . ($resultRef->getQueryStart() + $usedBlockIndex->{$sampleID}->{'start'} - 1)
            . "-"
            . ($resultRef->getQueryEnd() + $usedBlockIndex->{$sampleID}->{'start'} - 1);
        }
      }
      my $sequence = $resultRef->getQueryString();
      $sequence =~ s/-//g;
      push @{ $families{ $resultRef->getSubjName() }->{'elements'} },
          {
            'seqID' => $seqID,
            'seq'   => $sequence,
            'score' => $resultRef->getScore()
          };
    }
    undef $resultCollection;

    # Sort keys by most abundant family first
    # RMH: 9/19/23 : Added secondary sort to ensure deterministic behaviour
    my @sortedKeys = sort {
      $#{ $families{$b}->{'elements'} } <=> $#{ $families{$a}->{'elements'} } ||
          $a cmp $b 
    } keys( %families );

    my $doRefinement = 1;
    if ( $doRefinement ) {

      # Create files of instances for refinement
      my $familyID = 0;
      my %refinableFamilies = ();
      # key = "R=#" identifier
      foreach my $key ( @sortedKeys ) {
        # Quit if remaining family sizes are below cutoff
        last if ( ($#{ $families{$key}->{'elements'} }+1) < $familySizeCutoff );
 
        #print "SORT: $key = family-$familyID\n";
        $families{$key}->{'roundfam'} = $familyID;

        open FAM, ">$workDir/family-$familyID.fa"
            or die "RepeatModler: Can't open "
            . "output file family-$familyID.fa\n";

        # Sort elements in each family by highest scoring first
        my $numElements = 0;
        foreach my $ele ( sort { $b->{'score'} <=> $a->{'score'} }
                          @{ $families{$key}->{'elements'} } )
        {
          # Number of elements to keep
          last if ( $numElements == 100 );
          print FAM ">" . $ele->{'seqID'} . "\n";
          print FAM "" . $ele->{'seq'} . "\n";
          $numElements++;
        }
        close FAM;
        $families{$key}->{'numOfEles'} = $numElements;
        $refinableFamilies{$key}++;
        $familyID++;
      }

      my $tsk = ThreadedTaskSimple->new();
      $tsk->setWorkingDir($workDir);
      $tsk->setName("refining_job");
      $tsk->setNumThreads($threads);
      $tsk->setMaxRetries(1);

      my $jobIdx = 0;
      foreach my $key ( keys %refinableFamilies ) {
        my $familyID = $families{$key}->{'roundfam'};
        #print " -- Refining Family $key / $familyID ( RS Elements: "
        #    . ( $#{ $families{$key}->{'elements'} } + 1 )
        #    . ", Using $numElements )\n";
        $tsk->addJob( name => "refiner-job-$jobIdx",
                      function => \&refineOneFamily,
                      parameters => ["$workDir/family-$familyID.fa", $genomeDB, $workDir, 1, $DEBUG] );
        $jobIdx++;
      }
      elapsedTime(2);
      $tsk->execute();
      print "Refinement: " . elapsedTime( 2 ) . "\n" if ( 1 || $DEBUG );

      for my $key ( keys %refinableFamilies ) {
        my $cons;
        my $maSize = 1;
        my $familyID = $families{$key}->{'roundfam'};
        my $numElements = $families{$key}->{'numOfEles'};

        if ( -s "$workDir/family-$familyID.fa.refiner_cons" ) {
          open INREF, "<$workDir/family-$familyID.fa.refiner_cons"
              or die $CLASS
              . ": Could not open refined model $workDir/family-$familyID."
              . "fa.refiner_cons!\n";
          while ( <INREF> ) {
            if ( /Final Multiple Alignment Size = (\d+)/ ) {
              $maSize = $1;
            }
            else {
              $cons .= $_;
            }
          }
          close INREF;
        }else {
          print "  WARNING: Refiner did not return a consensus for $workDir/family-$familyID.fa.\n";
        }

        if ( $cons ne "" ) {
          
          # Clean up consensi starting/ending with Ns
          #   - These are sections of the multiple alignment
          #     where there is no support for a consensus.
          $cons =~ s/^[Nn]*([^Nn].*[^Nn])[Nn]*$/$1/;

          # Save the consensus
          $families{$key}->{'consensus'}         = $cons;
          $families{$key}->{'finalElementCount'} = $maSize;

          # Concatenate individual seed alignments into one file per-round
          if ( -s "$workDir/family-$familyID.fa.refiner.stk" ) {
            my $stockholmFile = SeedAlignmentCollection->new();
            my $IN;
            open $IN, "<$workDir/family-$familyID.fa.refiner.stk"
                or die
                "Could not open family-$familyID.fa.refiner.stk for reading!\n";
            open OUT, ">> $workDir/families.stk"
                or die "Could not open $workDir/families.stk for appending!\n";
            $stockholmFile->read_stockholm( $IN );
            close $IN;

            # File only contains one family
            my $seedAlign = $stockholmFile->get( 0 );

            # Add details to DE line
            $seedAlign->setDescription( "RepeatModeler Generated - rnd-$round"
              . "_family-$familyID, RepeatScout: [ Index = $key, RS Size = "
              . ( $#{ $families{$key}->{'elements'} } + 1 )
              . ", Refiner Input Size = $numElements, Final Multiple Alignment Size = $maSize ]"
            );
            print OUT "" . $seedAlign->toString();
            close OUT;
          }

          # Save the consensus to the consensi file.
          print OUTC ">family-$familyID ( RepeatScout Family =  $key, "
              . " Size = "
              . ( $#{ $families{$key}->{'elements'} } + 1 )
              . ", Refiner Input Size = $numElements, Final Multiple Alignment"
              . " Size = "
              . $maSize . " )\n";
          print OUTC "$cons\n";

          my $indexStr =
                "<a href=\"family-$familyID-cons.html\">family-$familyID"
              . " ( RepeatScout Family $key, Size = "
              . ( $#{ $families{$key}->{'elements'} } + 1 )
              . ", Refiner Input Size = $numElements, "
              . " Final Multiple Alignment Size = "
              . $maSize
              . " )</a><br>\n";

          print INDX $indexStr;
          push @indices, [ $maSize, $indexStr ];

        }
      }
      close INDX;

      # sorted indices
      open INDX, ">$workDir/index.html"
          or die "Could not open $workDir/index.html for writing\n";

      foreach my $index ( sort { $b->[ 0 ] <=> $a->[ 0 ] } @indices ) {
        print INDX $index->[ 1 ];
      }
      close INDX;
      close OUTC;
    }
    else    # if ( $doRefinement )
    {
      my $familyID = 0;
      my $consDB = FastaDB->new( fileName => "$workDir/consensi.fa",
                                 openMode => SeqDBI::ReadOnly );
      foreach my $key ( @sortedKeys ) {
        $families{$key}->{'roundfam'}          = $familyID;
        $families{$key}->{'consensus'}         = $consDB->getSequence( $key );
        $families{$key}->{'finalElementCount'} =
            $#{ $families{$key}->{'elements'} } + 1;
        $familyID++;
      }
      undef $consDB;
    }

    #
    # Read in seed alignment data so that we can sort it by membership
    # as we do consensi.
    #
    my %seedAlnById = ();
    if ( -s "$workDir/families.stk" ) {
      my $stockholmFile = SeedAlignmentCollection->new();
      my $IN;
      open $IN, "<$workDir/families.stk"
          or die "Could not open $workDir/families.stk for reading!\n";
      $stockholmFile->read_stockholm( $IN );
      close $IN;

      for ( my $i = 0 ; $i < $stockholmFile->size() ; $i++ ) {
        my $seedAlign = $stockholmFile->get( $i );
        my $id        = $seedAlign->getId();
        if ( $id =~ /family-(\d+)/ ) {
          $seedAlign->setId( "rnd-$round" . "_family-$1" );
          $seedAlnById{$1} = $seedAlign;
        }
        else {
          print
"Warning: could not decode ID from $workDir/families.stk file: id = $id\n";
        }
      }
    }

    #
    # Write out final trimmed consensi.fa file
    #
    open OUTC, ">>$consensiFile"
        or die "Could not open $consensiFile for appending!\n";
    open OUTS, ">>$combFamiliesFile"
        or die "Could not open $combFamiliesFile for appending!\n";

    foreach my $familyKey (
      # RMH: 9/19/23 Added secondary sort to ensure deterministic behaviour
      sort {
        $families{$b}{'finalElementCount'} <=> $families{$a}{'finalElementCount'} ||
	$a cmp $b
      }
      keys( %families )
        )
    {
      last
          if (
             $families{$familyKey}->{'finalElementCount'} < $familySizeCutoff );
      print OUTS ""
          . $seedAlnById{ $families{$familyKey}->{'roundfam'} }->toString();
      print OUTC ">rnd-$round"
          . "_family-"
          . $families{$familyKey}->{'roundfam'}
          . " ( RepeatScout Family Size = "
          . ( $#{ $families{$familyKey}->{'elements'} } + 1 ) . ", "
          . "Final Multiple Alignment Size = "
          . $families{$familyKey}->{'finalElementCount'} . ", "
          . "Localized to "
          . scalar( keys( %localizedToSeqNames ) ) . " out " . "of "
          . $dbNumContigs
          . " contigs )\n";
      print OUTC "$families{ $familyKey }->{'consensus'}\n";
      $numModels++;

    }
    close OUTC;
    close OUTS;

    print "Family Refinement: " . elapsedTime( "family_refinement" ) . "\n";

  }
  undef $searchEngineN;
  #return \%families, $numModels;
  undef %families;
  return $numModels;

}


##-------------------------------------------------------------------------##
#
# Use: my = buildReconConsensiMT( workDir => "", round => #,
#                               consensiFile => "", combFamiliesFile => "",
#                               familyCutoff => # );
#
# Process output of RECON and generate consensi for each putative
# family.
#
##-------------------------------------------------------------------------##
sub buildReconConsensiMT {
  my %parameters = @_;

  die "buildReconConsensiMT(): Missing workDir parameter!\n"
      if ( !defined $parameters{'workDir'} );
  my $workDir = $parameters{'workDir'};

  die "buildReconConsensiMT(): Missing round parameter!\n"
      if ( !defined $parameters{'round'} );
  my $round = $parameters{'round'};

  die "buildReconConsensiMT(): Missing consensiFile parameter!\n"
      if ( !defined $parameters{'consensiFile'} );
  my $consensiFile = $parameters{'consensiFile'};

  die "buildReconConsensiMT(): Missing combFamiliesFile parameter!\n"
      if ( !defined $parameters{'combFamiliesFile'} );
  my $combFamiliesFile = $parameters{'combFamiliesFile'};

  die "buildReconConsensiMT(): Missing familyCutoff parameter!\n"
      if ( !defined $parameters{'familyCutoff'} );
  my $familySizeCutoff = $parameters{'familyCutoff'};

  die "buildReconConsensiMT(): Missing threads parameter!\n"
      if ( !defined $parameters{'threads'} );
  my $threads = $parameters{'threads'};

  die "buildReconConsnesiMT(): Missing genomeDB path parameter!\n"
      if ( !defined $parameters{'genomeDB'} );
  my $genomeDB = $parameters{'genomeDB'};

  ## Read in element definitions
  my %families = ();
  if ( -f "$workDir/summary/eles" ) {
    open IN, "<$workDir/summary/eles";
    while ( <IN> ) {
      next if ( /^#/ );
      if ( /^\s+\d+\s+/ ) {
        my @fields = split;

        #  fam    ele   dir  sequence    start     end
        #   1      2     1    gi|1009       51      114
        if ( $fields[ 4 ] < 0 || $fields[ 5 ] < 0 ) {
          warn "WARNING: RECON returned a negative offset:\n    $_\n"
              . "In file: $workDir/summary/eles.  Ignoring line and"
              . " moving on.\n";
          next;
        }
        push @{ $families{ $fields[ 0 ] }->{'elements'} },
            {
              seqName   => $fields[ 3 ],
              elementID => $fields[ 1 ],
              orient    => $fields[ 2 ],
              start     => $fields[ 4 ],
              end       => $fields[ 5 ]
            };
      }
    }
    close IN;
  }
  else {
    die "Error: Recon failed to produce the summary/eles file!\n";
  }

  ## Get Sequences
  print "  - Obtaining element sequences\n";
  open OUTC, ">$workDir/consensi.fa";
  open INDX, ">$workDir/index.html";

  ### Locate the batch containing this elements sequence.
  my $batchName = "sampleDB-$round.fa";

  ## Open the sequence batch file.
  my $batchDB = FastaDB->new( fileName => "$workDir/$batchName",
                              openMode => SeqDBI::ReadOnly );

  my @indices = ();

  # RMH: 9/19/23 : added secondary sort to ensure deterministic behaviour
  my @sortedKeys = sort {
    $#{ $families{$b}->{'elements'} } <=> $#{ $families{$a}->{'elements'} } ||
    $a cmp $b  
  } keys( %families );

  log_print "Number of families returned by RECON: "
      . scalar( keys( %families ) ) . "\n";
  print "Processing families with greater than $familySizeCutoff elements\n";
  elapsedTime( "family_refinement" );

  # Loop through all families starting with the ones
  # containing the most elements.
  print "  - Saving instances...\n" if ( $DEBUG );
  elapsedTime( 2 );
  my %refinableFamilies = ();
  foreach my $familyID ( @sortedKeys ) {
    last if ( $#{ $families{$familyID}->{'elements'} } < $familySizeCutoff );
    #print "\nProcessing RECON family: $familyID\n";

    ## Using the eles file and the sequence batch file
    ## create a new family-#.fa file containing the
    ## sequences for each elemement.
    my $elementsRef = $families{$familyID}->{'elements'};
    open FAM, ">$workDir/family-$familyID.fa"
        or die "RepeatModler: Can't open "
        . "output file family-$familyID.fa\n";

    my $numElements = 0;

    ## Loop through the elements for a given family starting
    ## first with the longest ones.
    #TODO: Consider saving all instances to a an aux file
    my $giID = 1;
    foreach my $elementRef (
      sort {
        ( $b->{'end'} - $b->{'start'} ) <=> ( $a->{'end'} - $a->{'start'} )
      } @{$elementsRef}
        )
    {
      $numElements++;
      #
      # Parse genomic start/end out of batch seqname:
      #
      #   The RM convention is that a batch file is a fragment
      #   of a genomic (input sequence) sample.  The range of
      #   the fragment is placed in the sequence name.  The
      #   general form is SeqID:startPos-endPos.  So in order
      #   to place this sequence in it's genomic context we
      #   need to parse out these numbers.
      #
      my $seqName = $elementRef->{'seqName'};
      my $seqDesc = $batchDB->getDescription( $seqName );
      my $genomicStartPos;
      my $genomicEndPos;
      my $genomicSeqID;
      if ( $seqDesc =~ /(\S+):(\d+)-(\d+)/ ) {
        $genomicSeqID    = $1;
        $genomicStartPos = $2;
        $genomicEndPos   = $3;
      }
      else {
        print
"ERROR: Sequence reference >$seqName< is not in the correct form!\n";
      }

      ## Get the sequence.
      my $startOffset  = $elementRef->{'start'};
      my $endOffset    = $elementRef->{'end'};
      my $length       = $endOffset - $startOffset;
      my $actualSeqLen = $batchDB->getSeqLength( $seqName );
      if ( $endOffset > $actualSeqLen ) {
        print
"WARNING: Attempt to extract substring outside the range of the sequence\n"
            . "         seqID=$seqName range=$startOffset-$endOffset!  Please report this\n"
            . "         warning to the developers at help\@dfam.org.\n";
        next;
      }
      my $sequence = $batchDB->getSubstr( $seqName, $startOffset, $length );

      ## Determine if we need to reorient the sequence based
      ## on Recon's call of the orientation.
      if ( $elementRef->{'orient'} ne "1" ) {
        $sequence = reverse( $sequence );
        $sequence =~ tr/ACGTYRMKHBVD/TGCARYKMDVBH/;
        my $tmpOffset = $startOffset;
        # 2/1/22 : RMH - Bug these should be 1-based, fully closed coordinates
        #$startOffset = $endOffset;
        #$endOffset   = $tmpOffset;
        $startOffset = $endOffset-1;
        $endOffset   = $tmpOffset-1;
      }

      ## Save the sequence
      $elementRef->{'sequence'} = $sequence;
      # 2/1/22 : RMH - Bug these should be 1-based, fully closed coordinates
      #print FAM ">gi|$giID $genomicSeqID" . ":"
      #    . ( $startOffset + $genomicStartPos - 1 ) . "-"
      #    . ( $endOffset + $genomicStartPos - 1 )
      #    . " element-"
      #    . $elementRef->{'elementID'} . "\n";
      print FAM ">gi|$giID $genomicSeqID" . ":"
          . ( $startOffset + $genomicStartPos ) . "-"
          . ( $endOffset + $genomicStartPos )
          . " element-"
          . $elementRef->{'elementID'} . "\n";
 
      print FAM "" . $elementRef->{'sequence'} . "\n";
      $giID++;
    }
    close FAM;
    $refinableFamilies{$familyID}++;
    #print "  - Saving " . ( $#{$elementsRef} + 1 ) . " elements\n";
  }
  print "Instance Gathering: " . elapsedTime( 2 ) . "\n" if ( 1 || $DEBUG );


  my $tsk = ThreadedTaskSimple->new();
  $tsk->setWorkingDir($workDir);
  $tsk->setName("refining_job");
  $tsk->setNumThreads($threads);
  $tsk->setMaxRetries(1);

  my $jobIdx = 0;
  for my $familyID ( keys %refinableFamilies ) {
    $tsk->addJob( name => "refiner-job-$jobIdx",
                function => \&refineOneFamily,
                parameters => ["$workDir/family-$familyID.fa", $genomeDB, $workDir, 1, $DEBUG] );
    $jobIdx++;
  }  
  print "Refining $jobIdx families\n";

  elapsedTime(2);
  $tsk->execute();
  print "Refinement: " . elapsedTime( 2 ) . "\n" if ( $DEBUG );

  for my $familyID ( keys %refinableFamilies ) {
    #print "  - Refining family-$familyID model...\n";
    my $cons;
    my $maSize = 1;
    if ( -s "$workDir/family-$familyID.fa.refiner_cons" ) {
      open INREF, "<$workDir/family-$familyID.fa.refiner_cons"
          or die $CLASS
          . ": Could not open refined model $workDir/family-$familyID."
          . "fa.refiner_cons!\n";
      while ( <INREF> ) {
        if ( /Final Multiple Alignment Size = (\d+)/ ) {
          $maSize = $1;
        }
        else {
          $cons .= $_;
        }
      }
      close INREF;
    }
    else {
      print "  WARNING: Refiner did not return a consensus for $workDir/family-$familyID.fa.\n";
    }

    if ( $cons ne "" ) {

      # Clean up consensi starting/ending with Ns
      #   - These are sections of the multiple alignment
      #     where there is no support for a consensus.
      $cons =~ s/^[Nn]*([^Nn].*[^Nn])[Nn]*$/$1/;

      # Save the consensus
      $families{$familyID}->{'consensus'}         = $cons;
      $families{$familyID}->{'finalElementCount'} = $maSize;

      # Concatenate individual seed alignments into one file per-round
      if ( -s "$workDir/family-$familyID.fa.refiner.stk" ) {
        my $stockholmFile = SeedAlignmentCollection->new();
        my $IN;
        open $IN, "<$workDir/family-$familyID.fa.refiner.stk"
            or die
            "Could not open family-$familyID.fa.refiner.stk for reading!\n";
        open OUT, ">> $workDir/families.stk"
            or die "Could not open $workDir/families.stk for appending!\n";
        $stockholmFile->read_stockholm( $IN );
        close $IN;

        # File only contains one family
        my $seedAlign = $stockholmFile->get( 0 );

        # Add details to DE line
        $seedAlign->setDescription( "RepeatModeler Generated - rnd-$round"
                              . "_family-$familyID, RECON: [ RECON Size = "
                              . ( $#{ $families{$familyID}->{'elements'} } + 1 )
                              . ", Final Multiple Alignment Size = $maSize ]" );
        print OUT "" . $seedAlign->toString();
        close OUT;
      }

      # Save the consensus to the consensi file.
      print OUTC ">family-$familyID ( Recon Family Size = "
          . ( $#{ $families{$familyID}->{'elements'} } + 1 ) . ", "
          . "Final Multiple Alignment Size = "
          . $maSize . " )\n";
      print OUTC "$cons\n";

      my $indexStr =
            "<a href=\"family-$familyID-cons.html\">family-$familyID"
          . " ( Recon Family Size = "
          . ( $#{ $families{$familyID}->{'elements'} } + 1 ) . ", "
          . "Final Multiple Alignment Size = "
          . $maSize
          . " )</a><br>\n";

      print INDX $indexStr;
      push @indices, [ $maSize, $indexStr ];

    }

  }    # Foreach family
  undef $batchDB;
  close INDX;

  #
  # Read in seed alignment data so that we can sort it by membership
  # as we do consensi.
  #
  my %seedAlnById = ();
  if ( -s "$workDir/families.stk" ) {
    my $stockholmFile = SeedAlignmentCollection->new();
    my $IN;
    open $IN, "<$workDir/families.stk"
        or die "Could not open $workDir/families.stk for reading!\n";
    $stockholmFile->read_stockholm( $IN );
    close $IN;

    for ( my $i = 0 ; $i < $stockholmFile->size() ; $i++ ) {
      my $seedAlign = $stockholmFile->get( $i );
      my $id        = $seedAlign->getId();
      if ( $id =~ /family-(\d+)/ ) {
        $seedAlign->setId( "rnd-$round" . "_family-$1" );
        $seedAlnById{$1} = $seedAlign;
      }
      else {
        print
"Warning: could not decode ID from $workDir/families.stk file: id = $id\n";
      }
    }
  }

  # sorted indices
  open INDX, ">$workDir/index.html";
  foreach my $index ( sort { $b->[ 0 ] <=> $a->[ 0 ] } @indices ) {
    print INDX $index->[ 1 ];
  }
  close INDX;
  close OUTC;

  #
  # Append trimmed consensi.fa file to final results
  # Append trimmed stockholm file to final results
  #
  my $numModels = 0;
  open OUTC, ">>$consensiFile"
      or die "Could not open $consensiFile for appending!\n";
  open OUTS, ">>$combFamiliesFile"
      or die "Could not open $combFamiliesFile for appending!\n";

  foreach my $familyKey (
    # RMH: 9/19/23: Added secondary sort to ensure deterministic behaviour
    sort {
      $families{$b}{'finalElementCount'} <=> $families{$a}{'finalElementCount'} ||
      $a cmp $b
    }
    keys( %families )
      )
  {

    last
        if ( $families{$familyKey}->{'finalElementCount'} < $familySizeCutoff );
    print OUTS "" . $seedAlnById{$familyKey}->toString();
    print OUTC ">rnd-$round"
        . "_family-$familyKey ( Recon Family Size = "
        . ( $#{ $families{$familyKey}->{'elements'} } + 1 ) . ", "
        . "Final Multiple Alignment Size = "
        . $families{$familyKey}->{'finalElementCount'} . " )\n";
    print OUTC "$families{ $familyKey }->{'consensus'}\n";
    $numModels++;

  }
  close OUTC;
  close OUTS;
  print "Family Refinement: " . elapsedTime( "family_refinement" ) . "\n";

  #return \%families, $numModels;
  undef %families;
  return $numModels;
}

# For multi threaded jobs
sub refineOneFamily {
  my $instancesFile = shift;
  my $genomeDB = shift;
  my $workDir = shift;
  my $threads = shift;
  my $DEBUG = shift;
  
  my $threadParam = "";
  $threadParam = " -threads $threads " if ( $threads );
  my $cmd = "Refiner $threadParam "
          . "-noTmp -giToID $genomeDB.translation $instancesFile";
  $cmd .= " -quiet" unless ( $DEBUG );
  #print "Running: $cmd\n";
  system( $cmd );
}

 
#
# initializeSampleSourceList
#
#  Generate a list of sequence chunks that are between 100 and 'approxBlockSize'-ish
# base pairs long.  This ensures that we can build batches of a reasonable size with
# a random sampling of this list.  We also want to characterize the sequences
# ( e.g length and non-ambiguous base length ) and keep a count of the times this
# sequence unit was used in a sample ( for with-replacement sampling ).
#
# For instance, given human chr1 we would want to generate a list of ranges on this
# sequence like so:
#
#   { 'seqID' => "chr1", 'start' => 1, 'end' => 40000,
#     'length' => 40000, 'non_ambig_len' => 35828, used_cnt => 0 },
#   { 'seqID' => "chr1", 'start' => 40001, 'end' => 80000,
#     'length' => 40000, 'non_ambig_len' => 38238, used_cnt => 0 },
#   ...
#
# A contig of 500bp would be represented as:
#
#   { 'seqID' => "contig7", 'start' => 1, 'end' => 500,
#     'length' => 500, 'non_ambig_len' => 482, used_cnt => 0 },
#
#
# TODO: Consider how this might be sampled from so we get a diversity of
#       sequence lengths.
#
sub initializeSampleSourceList {
  my %parameters = @_;

  my $approxBlockSize = $parameters{'approxBlockSize'};
  my $genomeDB        = $parameters{'genomeDB'};
  my $dbSize          = $parameters{'dbSize'};
  my $noFrag          = $parameters{'noFrag'};

  my @sampleSourceList = ();

  # Open up genomeDB, breakup sequences (if need be) and
  # catalog what we have

  open IN, "$NCBIDBCMD_PRGM -db $genomeDB -entry all|"
      or die "initializeSampleSourceList: Could read from database "
      . "$genomeDB using $NCBIDBCMD_PRGM";

  print "initializeSampleSourceList: Scanning genome database...\n"
      if ( $DEBUG );
  my $id  = "";
  my $seq = "";
  while ( <IN> ) {
    if ( /^>/ || eof ) {
      my $tmpID = "endoffile";
      $tmpID = $1 if ( /^>(\S+)/ );
      if ( $seq ) {
        my $seqSize = length $seq;
        my $nonAmbigSize = $seqSize - ( $seq =~ tr/XN/XN/ );
        if ( $nonAmbigSize >= 100 ) {
          if ( defined $noFrag ) {
            push @sampleSourceList,
                {
                  'seqID'         => $id,
                  'start'         => 1,
                  'end'           => $seqSize,
                  'length'        => $seqSize,
                  'non_ambig_len' => $nonAmbigSize,
                  'used_cnt'      => 0
                };
          }
          else {
            my $partialBlockSize = $seqSize -
                ( int( $seqSize / $approxBlockSize ) * $approxBlockSize );
            my $numBlocks = int( $seqSize / $approxBlockSize ) + 1;
            my $blockAddl = 0;
            if ( $partialBlockSize < ( $approxBlockSize / 2 ) ) {

              # divide last block between all others
              $blockAddl = int( $partialBlockSize / $numBlocks );
            }

            my $seqRemaining = $seqSize;
            my $startPos     = 1;
            my $blockSize    = $approxBlockSize + $blockAddl;
            for ( my $i = 0 ; $i < $numBlocks ; $i++ ) {
              last if ( $blockSize >= $seqRemaining );
              my $blockseq = substr( $seq, $startPos, $blockSize );
              my $nonAmbigBlockSize = $blockSize - ( $blockseq =~ tr/XN/XN/ );
              push @sampleSourceList,
                  {
                    'seqID'         => $id,
                    'start'         => $startPos,
                    'end'           => $startPos + $blockSize - 1,
                    'length'        => $blockSize,
                    'non_ambig_len' => $nonAmbigBlockSize,
                    'used_cnt'      => 0
                  };

              $seqRemaining -= $blockSize;
              $startPos += $blockSize;
            }
            if ( $seqRemaining > 0 ) {
              my $blockseq = substr( $seq, $startPos );
              my $nonAmbigBlockSize =
                  $seqRemaining - ( $blockseq =~ tr/XN/XN/ );
              push @sampleSourceList,
                  {
                    'seqID'         => $id,
                    'start'         => $startPos,
                    'end'           => $startPos + $seqRemaining - 1,
                    'length'        => $seqRemaining,
                    'non_ambig_len' => $nonAmbigBlockSize,
                    'used_cnt'      => 0
                  };
            }
          }
        }
        else {
          print "initializeSampleSourceList: Discarding sequence $id because "
              . "it's painfully small ( size=$seqSize, nonAmbigSize=$nonAmbigSize )\n"
              if ( $DEBUG );
        }
      }
      $id  = $tmpID;
      $seq = "";
      next;
    }
    s/[\n\r\s]+//g;
    $seq .= uc( $_ );
  }
  close IN;

  return \@sampleSourceList;
}

#
# sampleFromDB
#
#   Given a list of sequence ranges that range from 100bp to
#   approxBlockSize-ish, pick at most sampleSizeMax bp of
#   sample for this round.  The input list is assumed to be
#   pre-randomized ( using FisherYatesShuffle ) and the
#   sample strategy is without-replacement by default.
#
#  Given:
#    dbSourceBlocks : a list of sample blocks e.g:
#          ( { 'seqID' => "chr1", 'start' => 1, 'end' => 40000,
#              'length' => 40000, 'non_ambig_len' => 35828, used_cnt => 0 }
#            ... )
#    sampleSizeMax:    The maximum sample size
#    dbFile : The compressed sequence database
#    dbType : rmblast or abblast
#    fastaFile : The output file to create
#    prohibitX : Convert 'x'/'X' to 'N'
#    withReplacement : Allow sequences to be reused between
#                      calls to this routine.
#    dbSourceBlock coordinates are 1-based, fully closed.
#
sub sampleFromDB {
  my %parameters = @_;

  my $dbSourceBlocks  = $parameters{'dbSourceBlocks'};
  my $sampleSizeMax   = $parameters{'sampleSizeMax'};
  my $dbFile          = $parameters{'dbFile'};
  my $dbType          = $parameters{'dbType'};
  my $fastaFile       = $parameters{'fastaFile'};
  my $prohibitX       = $parameters{'prohibitX'};
  my $withReplacement = $parameters{'withReplacement'};

  open ENTRYBAT, ">$fastaFile.entry_batch"
      or die
      "sampleFromDB: Could not open $fastaFile.entry_batch for writing!\n";

  my $sampleSeqSize   = 0;
  my %usedBlocksIndex = ();
  my @blocks          = ();
  foreach my $sampleSourceBlock ( @{$dbSourceBlocks} ) {
    my $seqID        = $sampleSourceBlock->{'seqID'};
    my $start        = $sampleSourceBlock->{'start'};
    my $end          = $sampleSourceBlock->{'end'};
    my $seqSize      = $sampleSourceBlock->{'length'};
    my $nonAmbigSize = $sampleSourceBlock->{'non_ambig_len'};

    if ( defined $withReplacement
         || $sampleSourceBlock->{'used_cnt'} == 0 )
    {

      # NOTE: It is assumed that under the "with-replacement" regime
      #       that the caller has re-shuffled the dbSourceBlocks so
      #       that the same sample is not re-generated each time.
      print ENTRYBAT "$seqID $start-$end\n";
      push @blocks, $sampleSourceBlock;
      $sampleSourceBlock->{'used_cnt'}++;
      $sampleSeqSize += $nonAmbigSize;
      last if ( $sampleSeqSize >= $sampleSizeMax );
    }
  }
  close ENTRYBAT;

  die "ABBLAST is no longer supported" if ( $dbType eq "abblast" );
  open IN, "$NCBIDBCMD_PRGM -db $dbFile -entry_batch $fastaFile.entry_batch|"
      or die
"sampleFromDB: Could not read entry_batch sequences from $dbFile using $NCBIDBCMD_PRGM.";
  open FASTA, ">$fastaFile"
      or die "sampleFromDB: Could not open $fastaFile for writing!\n";

  my $seq;
  my $id;
  my $giIndex = 1;
  while ( <IN> ) {
    if ( /^>/ || eof ) {
      my $tmpID = "endoffile";
      $tmpID = $1 if ( /^>(\S+)/ );
      if ( $seq ) {

        # Renumber sequences
        my $sample = $blocks[ $giIndex - 1 ];
        # NOTE: The coordinates stored in the SampleDB-#.fa files
        #       is 1-based, fully closed coordinates!
        print FASTA ">gi|$giIndex "
            . $sample->{'seqID'} . ":"
            . $sample->{'start'} . "-"
            . $sample->{'end'} . "\n";
        $giIndex++;
        $seq =~ s/[xX]/n/g if ( $prohibitX );
        $seq =~ s/(.{50})/$1\n/g;
        print FASTA "$seq\n";
        my $key = "gi|" . ( $giIndex - 1 );
        $usedBlocksIndex{$key} = $sample;
      }
      $id  = $tmpID;
      $seq = "";
      next;
    }
    s/[\n\r\s]+//g;
    $seq .= $_;
  }
  close IN;
  close FASTA;

  return ( \%usedBlocksIndex );
}

sub clearSampleUsedCounts {
  my %parameters     = @_;
  my $dbSourceBlocks = $parameters{'dbSourceBlocks'};
  foreach my $sampleSourceRecord ( @{$dbSourceBlocks} ) {
    $sampleSourceRecord->{'used_cnt'} = 0;
  }
}

##-------------------------------------------------------------------------##
## Use:  &fisherYatesShuffle( \@input_list );
##
## Randomly permute elements the input list.  This implementation uses
## the built-in random number generator so that the same sequence of
## shuffles can be reproduced by setting the generator seed.
##
## The Fisher–Yates shuffle, in its original form, was described in 1938
## by Ronald Fisher and Frank Yates in their book "Statistical tables for
## biological, agricultural and medical research."
##
##-------------------------------------------------------------------------##
sub fisherYatesShuffle {
  my $array = shift;
  my $i;
  for ( $i = @$array - 1 ; $i >= 0 ; --$i ) {
    my $j = int rand( $i + 1 );
    next if $i == $j;
    @$array[ $i, $j ] = @$array[ $j, $i ];
  }
}

##-------------------------------------------------------------------------##
## Use:  my ( $tempDir ) = &createTempDir( \@tmpPath, $prefix );
##
## Attempt to create a temporary directory of the form:
##
##     $prefix_$PID.$DATE/
##
## Where:
##     $prefix : Option prefix string or "RM" if not provided
##     $PID    : The process ID of this perl script
##     $DATE   : The date in DayOfWeek, Month, Day, Hour, Min
##               Second, Year
## Returns:
##    The name of the temporary directory or "" if it cannot be
##    created or written to.
##
##-------------------------------------------------------------------------##
sub createTempDir {
  my $tmpPathRef = shift;
  my $optPrefix  = shift;

  ## Get date
  my $date = localtime( time() );

  # Windows does not support the use of ":" in a filename.
  $date =~ s/[ ,\t,\n:]//g;

  # Set optional prefix
  my $prefix = "RM";
  $prefix = $optPrefix if ( $optPrefix ne "" );

  my $runnumber = "$$" . ".$date";
  my $tempDir   = "";
  foreach my $directory ( @{$tmpPathRef} ) {
    if ( $directory =~ /\/$/ ) {
      $tempDir = $directory . $prefix . "_$runnumber";
    }
    else {
      $tempDir = $directory . "/$prefix" . "_$runnumber";
    }

    if ( -d "$tempDir" || mkdir $tempDir, 0777 ) {
      if ( open( IN, ">$tempDir/deleteMe" ) ) {
        close IN;
        unlink( "$tempDir/deleteMe" );
        last;
      }
    }
    $tempDir = "";
  }
  return ( $tempDir );
}

##-------------------------------------------------------------------------##
## Use: my $string = elapsedTime( $index );
##
## Great little utility for measuring the elapsed
## time between one or more sections of perl code.
##
##-------------------------------------------------------------------------##
sub elapsedTime {
  my ( $TimeHistIdx ) = @_;
  if ( defined $TimeBefore{$TimeHistIdx} ) {
    my $DiffTime = time - $TimeBefore{$TimeHistIdx};
    $TimeBefore{$TimeHistIdx} = time;
    my $Min = int( $DiffTime / 60 );
    $DiffTime -= $Min * 60;
    my $Hours = int( $Min / 60 );
    $Min -= $Hours * 60;
    my $Sec = $DiffTime;
    my $timeStr = sprintf( "%02d:%02d:%02d", $Hours, $Min, $Sec );
    return "$timeStr (hh:mm:ss) Elapsed Time";
  }
  else {
    $TimeBefore{$TimeHistIdx} = time;
    return 0;
  }
}

1;

