#! /opt/gensoft/adm/bin/perl
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
use lib "/opt/gensoft/exe/RepeatModeler/2.0.5/lib/perl5";
##---------------------------------------------------------------------------##
##  File:
##      @(#) TRFMask
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Use the TRF program to mask a sequence
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2008-2019 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#
# To Do:
#
#

=head1 NAME

TRFMask - Mask tandem repeats in a sequence file

=head1 SYNOPSIS

  TRFMask [-options]  <fastaFile>

=head1 DESCRIPTION

Run TRF on a sequence and mask simple repeats, di-nucl
and above, with 5 or more tandemly repeated units.

The options are:

=over 4

=item -h(elp)

Detailed help

=back

=head1 SEE ALSO

=over 4

RepeatModeler

=back

=head1 COPYRIGHT

Copyright 2005-2019 Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Data::Dumper;
use File::Spec;
use Carp;
use Getopt::Long;

# RepeatModeler Libraries
use RepModelConfig;
use lib $ENV{REPEATMASKER_LIB};
use TRF;
use TRFSearchResult;

# RepeatMasker Libraries
use FastaDB;
use SearchEngineI;
use SimpleBatcher;

#
# Class Globals & Constants
#
my $CLASS = "TRFMask";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepModelConfig::DEBUGALL == 1 );
my $config   = $RepModelConfig::configuration;
my $TRF_PRGM = $config->{'TRF_DIR'}->{'value'} . "/trf";

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @opts = qw( help );

#
# Get the supplied command line options, and set flags
#
my %options = ();
unless ( &GetOptions( \%options, @opts ) ) {
  exec "pod2text $0";
  exit( 1 );
}

# Print the internal POD documentation if something is missing
if ( $options{'help'} ) {

  # This is a nifty trick so we don't have to have
  # a duplicate "USAGE()" subroutine.  Instead we
  # just recycle our POD docs.  See PERL POD for more
  # details.
  exec "pod2text $0";
  die;
}

die $CLASS . ": Missing fasta file!\n"
    if ( !defined $ARGV[ 0 ] || -z $ARGV[ 0 ] );
my $fastaFile = $ARGV[ 0 ];
my $wrkDir    = ( File::Spec->splitpath( $fastaFile ) )[ 1 ];
$fastaFile = ( File::Spec->splitpath( $fastaFile ) )[ 2 ];
$wrkDir = "." if ( $wrkDir eq "" );
print "Working directory = $wrkDir\n" if ( $DEBUG );

print "Masking $fastaFile\n" if ( $DEBUG );
my $maskedFile = $fastaFile . ".masked";

# TODO....determine if we should use /tmp or not?
my $ver = `$TRF_PRGM -v 2>&1`;
( $ver ) = ( $ver =~ /Tandem Repeats Finder, Version (\S+)/ );
if ( $ver < 4.09 ) {
  die "TRFMask requires TRF version 4.09 or above\n";
}
print "Running: $TRF_PRGM  ( version = $ver )\n" if ( $DEBUG );

my $cmd = "$TRF_PRGM $wrkDir/$fastaFile 2 7 7 80 10 50 500 -ngs -h";
open IN, "$cmd |" or die "TRFMask: Could not run command $cmd\n";
print "Running $cmd\n" if ( $DEBUG );
my %results;
my $id = "";
while ( <IN> ) {

# @gi|521 gi|579970:1-422
# 224 268 22 2.0 22 87 8 65 13 20 37 28 1.90 TCAGTAGTCGTGCTGCATGGGC TCAGTAGCTGTGCTGCATGGGCTCAGTAGTCGTGGTGCATGGGCT ATTGTGGTGGCTTCTCTTATTGCTGAACATAGGCTCGAGGCACCCAGACT TATGTGCCCCAGCATGCGGGATCTTCCTAGACAGGGATTGAAAGAACCCA
  if ( /^@(\S+)/ ) {
    $id = $1;
  }
  elsif ( /^(\d+)\s+(\d+)\s+(\d+)\s+([\d\.]+)/ ) {

    # $1 startPos
    # $2 endPos
    # $3 period
    # $4 copies
    if ( $3 > 1 && $4 > 4 ) {
      push @{ $results{$id} }, [ $1, $2 ];
    }
  }
}
close IN;

open IN, "<$wrkDir/$fastaFile"
    or die "TRFMask: Could not open $fastaFile for reading!\n";
open OUT, ">$wrkDir/$fastaFile.masked"
    or die "TRFMask: Could not open $wrkDir/$fastaFile.masked for writing!\n";
my $seq;
my $id;
my $fullHDR;
my $repeatsMasked = 0;
while ( <IN> ) {

  if ( /^>(\S+)/ ) {
    my $tmpID  = $1;
    my $tmpHDR = $_;
    if ( $seq ) {
      if ( exists $results{$id} ) {
        foreach my $result ( @{ $results{$id} } ) {
          my $len = $result->[ 1 ] - $result->[ 0 ] + 1;
          substr( $seq, $result->[ 0 ] - 1, $len ) = "N" x $len;
          $repeatsMasked++;
        }
      }
      $seq =~ s/(.{50})/$1\n/g;
      print OUT "$fullHDR$seq\n";
    }
    $id      = $tmpID;
    $fullHDR = $tmpHDR;
    $seq     = "";
    next;
  }
  s/[\n\r\s]+//g;
  $seq .= $_;
}
if ( $seq ) {
  if ( exists $results{$id} ) {
    foreach my $result ( @{ $results{$id} } ) {
      my $len = $result->[ 1 ] - $result->[ 0 ] + 1;
      substr( $seq, $result->[ 0 ] - 1, $len ) = "N" x $len;
    }
  }
  $seq =~ s/(.{50})/$1\n/g;
  print OUT "$fullHDR$seq\n";
  $repeatsMasked++;
}
close IN;
close OUT;

print "       $repeatsMasked Tandem Repeats Masked\n";

1;
