#!/usr/bin/env bash

# exit when any command fails
set -eo pipefail


# wfmash's default values
SEGMENT_LENGTH=5000
MAP_PCT_ID=90
MASH_KMER=19
MASH_KMER_THRES=0.001

# wfmash's parameters
input_fasta=false
segment_length=$SEGMENT_LENGTH
block_length=false
map_pct_id=$MAP_PCT_ID
n_mappings=false
no_splits=false
sparse_map=false
mash_kmer=$MASH_KMER
mash_kmer_thres=$MASH_KMER_THRES
exclude_delim=false

# seqwish's default values
MIN_MATCH_LENGTH=19
SPARSE_FACTOR=0
TRANSCLOSE_BATCH=10000000

# seqwish's parameters
min_match_length=$MIN_MATCH_LENGTH
sparse_factor=$SPARSE_FACTOR
transclose_batch=$TRANSCLOSE_BATCH

# smoothxg's default values
MAX_PATH_JUMP=0
MAX_EDGE_JUMP=0
TARGET_POA_LENGTH=700,900,1100
POA_PADDING=0.001
PAD_MAX_DEPTH=100
CONSENSUS_PREFIX=Consensus_

# smoothxg's parameters
skip_normalization=false
n_haps=false
max_path_jump=$MAX_PATH_JUMP
max_edge_jump=$MAX_EDGE_JUMP
target_poa_length=$TARGET_POA_LENGTH
poa_params=false
poa_padding=$POA_PADDING
pad_max_depth=$PAD_MAX_DEPTH
run_abpoa=false
run_global_poa=false
write_maf=false
consensus_spec=false # Disabled due to PGGB's #133 and #182 issues
consensus_prefix=$CONSENSUS_PREFIX

# odgi's parameters
do_viz=true
do_layout=true
do_stats=false

# vg's parameter
vcf_spec=false

# multiqc's parameter
multiqc=false

# default values
OUTPUT_DIR=$(pwd)
THREADS=$(getconf _NPROCESSORS_ONLN 2>/dev/null || getconf NPROCESSORS_ONLN 2>/dev/null || echo 1)

# general parameters
output_dir=$OUTPUT_DIR
input_temp_dir=false
input_paf=false
resume=false
threads=$THREADS
poa_threads=0
keep_intermediate_files=false
compress=false
show_version=false
show_help=false

# not exposed parameters
no_merge_segments=false
block_ratio_min=0
reduce_redundancy=true


if [ $# -eq 0 ]; then
    show_help=true
fi

# read the options
cmd=$0" "$@
TEMP=`getopt -o i:o:D:a:p:n:s:l:K:F:k:x:f:B:XH:j:P:O:Me:t:T:vhASY:G:Q:d:I:R:NbrmZzV: --long input-fasta:,output-dir:,temp-dir:,input-paf:,map-pct-id:,n-mappings:,segment-length:,block-length-min:,mash-kmer:,mash-kmer-thres:,min-match-length:,sparse-map:,sparse-factor:,transclose-batch:,skip-normalization,n-haps:,path-jump-max:,subpath-min:,edge-jump-max:,threads:,poa-threads:,skip-viz,do-layout,help,no-merge-segments,do-stats,exclude-delim:,poa-length-target:,poa-params:,poa-padding:,run-abpoa,global-poa,write-maf,consensus-spec:,consensus-prefix:,pad-max-depth:,block-id-min:,block-ratio-min:,no-splits,resume,keep-temp-files,multiqc,compress,vcf-spec:,version -n 'pggb' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
while true ; do
    case "$1" in
        -i|--input-fasta) input_fasta=$2 ; shift 2 ;;
        -s|--segment-length) segment_length=$2 ; shift 2 ;;
        -l|--block-length) block_length=$2 ; shift 2 ;;
        -p|--map-pct-id) map_pct_id=$2 ; shift 2 ;;
        -n|--n-haplotypes) n_mappings=$2 ; shift 2 ;;
        -N|--no-splits) no_splits=true ; shift ;;
        -x|--sparse-map) sparse_map=$2 ; shift 2 ;;
        -K|--mash-kmer) mash_kmer=$2 ; shift 2 ;;
        -F|--mash-kmer-thres) mash_kmer_thres=$2 ; shift 2 ;;
        -Y|--exclude-delim) exclude_delim=$2 ; shift 2 ;;
        -k|--min-match-length) min_match_length=$2 ; shift 2 ;;
        -f|--sparse-factor) sparse_factor=$2 ; shift 2 ;;
        -B|--transclose-batch) transclose_batch=$2 ; shift 2 ;;
        -X|--skip-normalization) skip_normalization=true ; shift ;;
        -H|--n-haplotypes-smooth) n_haps=$2 ; shift 2 ;;
        -j|--path-jump-max) max_path_jump=$2 ; shift 2 ;;
        -e|--edge-jump-max) max_edge_jump=$2 ; shift 2 ;;
        -G|--poa-length-target) target_poa_length=$2 ; shift 2 ;;
        -P|--poa-params) poa_params=$2 ; shift 2 ;;
        -O|--poa-padding) poa_padding=$2 ; shift 2 ;;
        -d|--pad-max-depth) pad_max_depth=$2 ; shift 2 ;;
        -b|--run-abpoa) run_abpoa=true ; shift ;;
        -z|--global-poa) run_global_poa=true ; shift ;;
        -M|--write-maf) write_maf=true ; shift ;;
        #-C|--consensus-spec) consensus_spec=$2 ; shift 2 ;;
        -Q|--consensus-prefix) consensus_prefix=$2 ; shift 2 ;;
        -v|--skip-viz) do_viz=false ; do_layout=false; shift ;;
        -S|--do-stats) do_stats=true ; shift ;;
        -V|--vcf-spec) vcf_spec=$2 ; shift 2 ;;
        -m|--multiqc) multiqc=true ; shift ;;
        -o|--output-dir) output_dir=$2 ; shift 2 ;;
        -D|--temp-dir) input_temp_dir=$2 ; shift 2 ;;
        -a|--input-paf) input_paf=$2 ; shift 2 ;;
        -r|--resume) resume=true ; shift ;;
        -t|--threads) threads=$2 ; shift 2 ;;
        -T|--poa-threads) poa_threads=$2 ; shift 2 ;;
        -A|--keep-temp-files) keep_intermediate_files=true ; shift ;;
        -Z|--compress) compress=true ; shift ;;
        --version) show_version=true ; shift ;;
        -h|--help) show_help=true ; shift ;;
        --) shift ; break ;;
        *) echo "$2" "Internal error!" ; exit 1 ;;
    esac
done

if [ $show_version == true ]; then
    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
    cd "$SCRIPT_DIR"
    GIT_VERSION=$(git describe --always --tags --long)
    echo "pggb $GIT_VERSION"
    cd - &> /dev/null
    exit
fi

# Mandatory parameters
if [[ "$input_fasta" == false || $n_mappings == false ]]; then
    show_help=true
    >&2 echo "ERROR: mandatory arguments -i and -n"
fi

if (( "$n_mappings" < 2 )); then
    show_help=true
    >&2 echo "ERROR: -n must be greater than or equal to 2"
fi

if [ $show_help == true ]; then
    padding=`printf %${#0}s` # prints as many spaces as the length of $0
    echo "usage: $0 -i <input-fasta> -n <n-haplotypes> [options]"
    echo "options:"
    echo "   [wfmash]"
    echo "    -i, --input-fasta FILE      input FASTA/FASTQ file"
    echo "    -s, --segment-length N      segment length for mapping [default: "$SEGMENT_LENGTH"]"
    echo "    -l, --block-length N        minimum block length filter for mapping [default: 5*segment-length]"
    echo "    -p, --map-pct-id PCT        percent identity for mapping/alignment [default: "$MAP_PCT_ID"]"
    echo "    -n, --n-haplotypes N        number of haplotypes"
    echo "    -N, --no-split              disable splitting of input sequences during mapping [default: enabled]"
    echo "    -x, --sparse-map N          keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0]"
    echo "    -K, --mash-kmer N           kmer size for mapping [default: "$MASH_KMER"]"
    echo "    -F, --mash-kmer-thres N     ignore the top % most-frequent kmers [default: "$MASH_KMER_THRES"]"
    echo "    -Y, --exclude-delim C       skip mappings between sequences with the same name prefix before"
    echo "                                the given delimiter character [default: all-vs-all and !self]"
    echo "   [seqwish]"
    echo "    -k, --min-match-len N       filter exact matches below this length [default: "$MIN_MATCH_LENGTH"]"
    echo "    -f, --sparse-factor N       keep this randomly selected fraction of input matches [default: no sparsification]"
    echo "    -B, --transclose-batch      number of bp to use for transitive closure batch [default: "$TRANSCLOSE_BATCH"]"
    echo "   [smoothxg]"
    echo "    -X, --skip-normalization    do not normalize the final graph [default: normalize the graph]"
    echo "    -H, --n-haplotypes-smooth N number of haplotypes, if different than that set with -n [default: -n]"
    echo "    -j, --path-jump-max         maximum path jump to include in block [default: "$MAX_PATH_JUMP"]"
    echo "    -e, --edge-jump-max N       maximum edge jump before breaking [default: "$MAX_EDGE_JUMP"]"
    echo "    -G, --poa-length-target N,M target sequence length for POA, one per pass [default: "$TARGET_POA_LENGTH"]"
    echo "    -P, --poa-params PARAMS     score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2"
    echo "                                may also be given as presets: asm5, asm10, asm15, asm20"
    echo "                                [default: 1,19,39,3,81,1 = asm5]"
    echo "    -O, --poa-padding N         pad each end of each sequence in POA with N*(mean_seq_len) bp [default: "$POA_PADDING"]"
    echo "    -d, --pad-max-depth N       depth/haplotype at which we don't pad the POA problem [default: "$PAD_MAX_DEPTH"]"
    echo "    -b, --run-abpoa             run abPOA [default: SPOA]"
    echo "    -z, --global-poa            run the POA in global mode [default: local mode]"
    echo "    -M, --write-maf             write MAF output representing merged POA blocks [default: off]"
    echo "    -Q, --consensus-prefix P    use this prefix for consensus path names [default: "$CONSENSUS_PREFIX"]"
    #echo "    -C, --consensus-spec SPEC   consensus graph specification: write consensus graphs to"
    #echo "                                BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter"
    #echo "                                (which defines the length of divergences from consensus paths to preserve in the"
    #echo "                                output), optionally a file containing reference paths to preserve in the output,"
    #echo "                                a flag (y/n) indicating whether we should also use the POA consensus paths, a"
    #echo "                                minimum coverage of consensus paths to retain (min_cov), and a maximum allele"
    #echo "                                length (max_len, defaults to 1e6); implies -a; example:"
    #echo "                                cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000"
    #echo "                                [default: off]"
    echo "   [odgi]"
    echo "    -v, --skip-viz              don't render visualizations of the graph in 1D and 2D [default: make them]"
    echo "    -S, --stats                 generate statistics of the seqwish and smoothxg graph [default: off]"
    echo "   [vg]"
    echo "    -V, --vcf-spec SPEC         specify a set of VCFs to produce with SPEC = REF:DELIM[:LEN][,REF:DELIM:[LEN]]*"
    echo "                                the paths matching ^REF are used as a reference, while the sample haplotypes"
    echo "                                are derived from path names, e.g. when DELIM=# and with '-V chm13:#',"
    echo "                                a path named HG002#1#ctg would be assigned to sample HG002 phase 1."
    echo "                                If LEN is specified and greater than 0, the VCFs are decomposed, filtering "
    echo "                                sites whose max allele length is greater than LEN. [default: off]"
    echo "   [multiqc]"
    echo "    -m, --multiqc               generate MultiQC report of graphs' statistics and visualizations,"
    echo "                                automatically runs odgi stats [default: off]"
    echo "   [general]"
    echo "    -o, --output-dir PATH       output directory"
    echo "    -D, --temp-dir PATH         directory for temporary files"
    echo "    -a, --input-paf FILE        input PAF file; the wfmash alignment step is skipped"
    echo "    -r, --resume                do not overwrite existing outputs in the given directory"
    echo "                                [default: start pipeline from scratch]"
    echo "    -t, --threads N             number of compute threads to use in parallel steps [default: "$threads"]"
    echo "    -T, --poa-threads N         number of compute threads to use during POA (set lower if you OOM during smoothing)"
    echo "    -A, --keep-temp-files       keep intermediate graphs"
    echo "    -Z, --compress              compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs with pigz,"
    echo "                                and variant (.vcf) outputs with bgzip"
    echo "    --version                   display the version of pggb"
    echo "    -h, --help                  this text"
    echo
    echo "Use wfmash, seqwish, smoothxg, odgi, gfaffix, and vg to build, project and display a pangenome graph."
    exit
fi


# Alignment
if [[ $input_paf == false ]]; then
  mapper=wfmash
  mapper_version=$(wfmash --version 2>&1)
  mapper_letter='W'
else
  mapper=external
  mapper_version=unknown
  mapper_letter='E'
fi

if [[ $block_length == false ]]; then
    block_length=$(echo "$segment_length * 5" | bc)
fi

n_mappings_minus_1=$( echo "$n_mappings - 1" | bc )

paf_spec=$mapper_letter-s$segment_length-l$block_length-p$map_pct_id-n$n_mappings_minus_1-K$mash_kmer-F$mash_kmer_thres-x$sparse_map

split_cmd=""
if [[ $no_splits == true ]]; then
    split_cmd=-N
    paf_spec="$paf_spec"-N
fi

# Normalization ($n_haps is checked in this part of the script because it is also used for the 'auto' mapping sparsification)
if [[ $n_haps == false ]]; then
    n_haps=$n_mappings
fi

sparse_map_cmd=""
if [[ $sparse_map == "auto" ]]; then
    # set sparse mapping using giant component heuristic
    # we keep 10x log(n)/n mappings
    # if this is < 1, otherwise we keep all
    n=$n_haps
    sparse_map_frac=$(echo "x=l($n)/$n * 10; if (x < 1) { x } else { 1 }"  | bc -l | cut -c -8)
    sparse_map_cmd="-x $sparse_map_frac"
elif [[ $sparse_map != false ]]; then
    sparse_map_cmd="-x $sparse_map"
fi

merge_cmd=""
if [[ $no_merge_segments == true ]]; then
    merge_cmd=-M
    paf_spec="$paf_spec"-M
fi

if [[ $exclude_delim != false ]]; then
    exclude_cmd="-Y "$exclude_delim
    paf_spec="$paf_spec"-Y
else
    exclude_cmd=-X
    paf_spec="$paf_spec"-X
fi

if [[ "$input_paf" == false ]]; then
    prefix_paf="$input_fasta".$(echo "$paf_spec" | sha256sum | head -c 7)
else
    prefix_paf="$input_paf"
fi


# Graph induction
prefix_seqwish="$prefix_paf".$(echo k$min_match_length-f$sparse_factor-B$transclose_batch | sha256sum | head -c 7)

# poa param suggestions from minimap2
# - asm5, --poa-params 1,19,39,3,81,1, ~0.1 divergence
# - asm10, --poa-params 1,9,16,2,41,1, ~1 divergence
# - asm20, --poa-params 1,4,6,2,26,1, ~5% divergence
# between asm10 and asm20 ~ 1,7,11,2,33,1
poa_params_cmd=""
if [[ $poa_params == false ]]; then
    poa_params_cmd="-P 1,19,39,3,81,1"
else
    if [[ $poa_params == "asm5" ]]; then
        poa_params_cmd="-P 1,19,39,3,81,1"
    elif [[ $poa_params == "asm10" ]]; then
        poa_params_cmd="-P 1,9,16,2,41,1"
    elif [[ $poa_params == "asm15" ]]; then
        poa_params_cmd="-P 1,7,11,2,33,1"
    elif [[ $poa_params == "asm20" ]]; then
        poa_params_cmd="-P 1,4,6,2,26,1"
    else
        poa_params_cmd="-P $poa_params"
    fi
fi

block_id_min=$(echo "scale=4; $map_pct_id / 100.0" | bc)
prefix_smoothed="$prefix_seqwish".$(echo h$n_haps-G$target_poa_length-j$max_path_jump-e$max_edge_jump-d$pad_max_depth-I$block_id_min-R$block_ratio_min-p$poa_params-O$poa_padding | sha256sum | head -c 7).smooth
prefix_smoothed_output="$prefix_smoothed"


fmt="%C\n%Us user %Ss system %P cpu %es total %MKb max memory"
timer=$(which time)


# Directories
if [ ! -e "$output_dir" ]; then
  mkdir "$output_dir"
fi
prefix_paf="$output_dir"/$(basename "$prefix_paf")
prefix_smoothed_output="$output_dir"/$(basename "$prefix_smoothed")

# If the temporary directory is not explicitly set, set it equal to the output directory
if [[ "$input_temp_dir" == false ]]; then
  temp_dir="$output_dir" # NOTE: the output directory always exists before this statement
else
  temp_dir="$input_temp_dir"
fi
temp_dir_was_created=false
if [ ! -e "$temp_dir" ]; then
  mkdir "$temp_dir"
  temp_dir_was_created=true
fi
prefix_mappings_paf="$temp_dir"/$(basename "$prefix_paf")
prefix_seqwish="$temp_dir"/$(basename "$prefix_seqwish")
prefix_smoothed="$temp_dir"/$(basename "$prefix_smoothed")

if [[ $poa_threads == 0 ]]; then
    poa_threads=$threads
fi

date=`date "+%m-%d-%Y_%H:%M:%S"`
log_file="$prefix_smoothed_output".$date.log
param_file="$prefix_smoothed_output".$date.params.yml

# write parameters to log_file:
echo -e "Starting pggb on $date\n" > "$log_file"
echo -e "Command: $cmd\n" >> "$log_file"
echo -e "PARAMETERS\n" >> "$log_file"
cat <<EOT | tee -a "$log_file" "$param_file" >/dev/null
general:
  input-fasta:        $input_fasta
  output-dir:         $output_dir
  temp-dir:           $temp_dir
  resume:             $resume
  compress:           $compress
  threads:            $threads
  poa_threads:        $poa_threads
$mapper:
  version:            $mapper_version
  segment-length:     $segment_length
  block-length:       $block_length
  map-pct-id:         $map_pct_id
  n-mappings:         $n_mappings
  no-splits:          $no_splits
  sparse-map:         $sparse_map
  mash-kmer:          $mash_kmer
  mash-kmer-thres:    $mash_kmer_thres
  exclude-delim:      $exclude_delim
  no-merge-segments:  $no_merge_segments
seqwish:
  version:            $(seqwish --version 2>&1)
  min-match-len:      $min_match_length
  sparse-factor:      $sparse_factor
  transclose-batch:   $transclose_batch
smoothxg:
  version:            $(smoothxg --version 2>&1)
  skip-normalization: $skip_normalization
  n-haps:             $n_haps
  path-jump-max:      $max_path_jump
  edge-jump-max:      $max_edge_jump
  poa-length-target:  $target_poa_length
  poa-params:         ${poa_params_cmd:3}
  poa_padding:        $poa_padding
  run_abpoa:          $run_abpoa
  run_global_poa:     $run_global_poa
  pad-max-depth:      $pad_max_depth
  write-maf:          $write_maf
  consensus-spec:     $consensus_spec
  consensus-prefix:   $consensus_prefix
  block-id-min:       $block_id_min
  block-ratio-min:    $block_ratio_min
odgi:
  version:            $(odgi version -v 2>&1)
  viz:                $do_viz
  layout:             $do_layout
  stats:              $do_stats
gfaffix:
  version:            v$(gfaffix --version | cut -f 2 -d ' ')
  reduce-redundancy:  $reduce_redundancy
vg:
  version:            $(vg version | head -n 1 | cut -f 3 -d ' ')
  deconstruct:        $vcf_spec
reporting:
  version:            v$(multiqc --version | cut -f 3 -d ' ')
  multiqc:            $multiqc
EOT


echo -e "\nRunning pggb\n" >> "$log_file"
if [[ "$input_paf" == false ]]; then
  if [[ ! -s "$prefix_paf".alignments.$mapper.paf || $resume == false ]]; then

    # Check if also the mappings are missing, else do not recompute them when $resume == true
    if [[ ! -s "$prefix_mappings_paf".mappings.$mapper.paf || $resume == false ]]; then
      ($timer -f "$fmt" wfmash \
          -s $segment_length \
          -l $block_length \
          -p $map_pct_id \
          -n $n_mappings_minus_1 \
          $split_cmd \
          $sparse_map_cmd \
          -k $mash_kmer \
          -H $mash_kmer_thres \
          $exclude_cmd \
          -t $threads \
          --tmp-base $temp_dir \
          $merge_cmd \
          "$input_fasta" \
          --approx-map \
          > "$prefix_mappings_paf".mappings.$mapper.paf) 2> >(tee -a "$log_file")
    fi

    ($timer -f "$fmt" wfmash \
        -s $segment_length \
        -l $block_length \
        -p $map_pct_id \
        -n $n_mappings_minus_1 \
        $split_cmd \
        $sparse_map_cmd \
        -k $mash_kmer \
        -H $mash_kmer_thres \
        $exclude_cmd \
        -t $threads \
        --tmp-base $temp_dir \
        $merge_cmd \
        "$input_fasta" \
        -i "$prefix_mappings_paf".mappings.$mapper.paf --invert-filtering \
        > "$prefix_paf".alignments.$mapper.paf) 2> >(tee -a "$log_file")
  fi
fi

# adjust seqwish_paf
if [[ "$input_paf" == false ]]; then
    seqwish_paf="$prefix_paf".alignments.$mapper.paf
else
    seqwish_paf="$input_paf"
fi
if [[ ! -s $prefix_seqwish.seqwish.gfa || $resume == false ]]; then
    $timer -f "$fmt" seqwish \
        -s "$input_fasta" \
        -p "$seqwish_paf" \
        -k $min_match_length \
        -f $sparse_factor \
        -g "$prefix_seqwish".seqwish.gfa \
        -B $transclose_batch \
        -t $threads \
        --temp-dir "$temp_dir" \
        -P \
        2> >(tee -a "$log_file")
fi

if [[ $skip_normalization == false ]]; then
  if [[ $consensus_spec != false ]]; then
      # for merging consensus (currently problematic) we should add "-M -J 1 -G 150" here
      consensus_params="-C ${prefix_smoothed_output}.cons,$consensus_spec"
  else
      consensus_params="-V"
  fi

  if [[ $write_maf != false ]]; then
      maf_params="-m ${prefix_smoothed_output}.maf"
  fi

  keep_temp_cmd=""
  if [[ $keep_intermediate_files == true ]]; then
      keep_temp_cmd="-K"
  fi

  smoothxg_xpoa_cmd="-S"
  if [[ "$run_abpoa" == true ]]; then
    smoothxg_xpoa_cmd=""
  fi

  smoothxg_poa_mode_cmd=""
  if [[ "$run_global_poa" == true ]]; then
    smoothxg_poa_mode_cmd="-Z"
  fi

  if [[ ! -s $prefix_smoothed.gfa || ($write_maf != false && ! -s ${prefix_smoothed_output}.maf) || $resume == false ]]; then
      resume=false # smoothxg is not deterministic, then all subsequent steps need to be rerun for consistency

      $timer -f "$fmt" smoothxg \
             -t $threads \
             -T $poa_threads \
             -g "$prefix_seqwish".seqwish.gfa \
             -r "$n_haps" \
             --base "$temp_dir" \
             $keep_temp_cmd \
             --chop-to 100 \
             -I "$block_id_min" \
             -R "$block_ratio_min" \
             -j $max_path_jump \
             -e $max_edge_jump \
             -l $target_poa_length \
             $poa_params_cmd \
             -O "$poa_padding" \
             -Y $(echo "$pad_max_depth * $n_haps" | bc) \
             -d 0 -D 0 \
             $smoothxg_xpoa_cmd $smoothxg_poa_mode_cmd \
             $maf_params \
             -Q "$consensus_prefix" \
             $consensus_params \
             -o "$prefix_smoothed".gfa \
             2> >(tee -a "$log_file")
  fi
else
  mv "$prefix_seqwish".seqwish.gfa "$prefix_smoothed".gfa
fi

if [[ $reduce_redundancy == true ]]; then
    # Collapse redundant nodes where possible
    if [[ ! -s "$prefix_smoothed_output".fix.gfa || ! -s "$prefix_smoothed_output".fix.affixes.tsv.gz || $resume == false ]]; then
      ( $timer -f "$fmt" gfaffix "$prefix_smoothed".gfa -o "$prefix_smoothed".fix.gfa | $timer -f "$fmt" pigz > "$prefix_smoothed_output".fix.affixes.tsv.gz ) 2> >(tee -a "$log_file")
    fi

    # Sort
    if [[ ! -s "$prefix_smoothed_output".final.og || $resume == false ]]; then
      ( $timer -f "$fmt" odgi build -t $threads -P -g "$prefix_smoothed".fix.gfa -o - -O \
          | $timer -f "$fmt" odgi unchop -P -t $threads -i - -o - \
          | $timer -f "$fmt" odgi sort -P -p Ygs --temp-dir "$temp_dir" -t $threads -i - -o "$prefix_smoothed_output".final.og ) 2> >(tee -a "$log_file")

      resume=false # The PG-SGD sorting (`-pY`) is not deterministic, then all subsequent steps need to be rerun for consistency
    fi

    if [[ ! -s "$prefix_smoothed_output".final.gfa || $resume == false ]]; then
      ( $timer -f "$fmt" odgi view -i "$prefix_smoothed_output".final.og -g > "$prefix_smoothed_output".final.gfa ) 2> >(tee -a "$log_file")
    fi
else
    if [[ ! -s "$prefix_smoothed_output".final.gfa || $resume == false ]]; then
      mv "$prefix_smoothed".gfa "$prefix_smoothed_output".final.gfa
    fi

    if [[ ! -s "$prefix_smoothed_output".final.og || $resume == false ]]; then
      $timer -f "$fmt" odgi build -t $threads -P -g "$prefix_smoothed_output".final.gfa -o "$prefix_smoothed_output".final.og 2> >(tee -a "$log_file")
    fi
fi


if [[ $vcf_spec != false ]]; then
    for s in $( echo "$vcf_spec" | tr ',' ' ' );
    do
        ref=$(echo "$s" | cut -f 1 -d: )
        delim=$(echo "$s" | cut -f 2 -d: )
        pop_length=$(echo "$s" | cut -f 3 -d: )
        if [[ -z $pop_length ]]; then
          pop_length=0
        fi
        vcf="$prefix_smoothed_output".final.$(echo $ref | tr '/|' '_').vcf
        if [[ ! -s $vcf || $resume == false ]]; then
          echo "[vg::deconstruct] making VCF with reference=$ref and delim=$delim"
          ( TEMPDIR=$(pwd) $timer -f "$fmt" vg deconstruct -P "$ref" \
                   -H "$delim" -e -a -t $threads "$prefix_smoothed_output".final.gfa >"$vcf" ) 2> >(tee -a "$log_file")
          bcftools stats "$vcf" > "$vcf".stats
        fi

        if [[ $pop_length -gt 0 ]]; then
          vcf_decomposed="$prefix_smoothed_output".final.$(echo $ref | tr '/|' '_').decomposed.vcf
          if [[ ! -s $vcf_decomposed || $resume == false ]]; then
            echo "[vg::deconstruct] decompose VCF"
            vcf_decomposed_tmp=$vcf_decomposed.tmp.vcf
            bgzip -c -@ 48 "$vcf" > "$vcf".gz
            TEMPDIR=$(pwd) $timer -f "$fmt" vcfbub -l 0 -a $pop_length --input "$vcf".gz | TEMPDIR=$(pwd) $timer -f "$fmt" vcfwave -I 1000 -t $threads > "$vcf_decomposed_tmp"

            #TODO: to remove when vcfwave will be bug-free
            # The TYPE info sometimes is wrong/missing
            # There are variants without the ALT allele
            bcftools annotate -x INFO/TYPE "$vcf_decomposed_tmp"  | awk '$5 != "."' > "$vcf_decomposed"
            rm "$vcf_decomposed_tmp" "$vcf".gz

            bcftools stats "$vcf_decomposed" > "$vcf_decomposed".stats
          fi
        fi
    done
fi

if [[ $multiqc == true ]]; then
    do_stats=true
fi

if [[ $do_stats == true ]]; then
    $timer -f "$fmt" odgi build -t $threads -P -g "$prefix_seqwish".seqwish.gfa -o "$prefix_seqwish".seqwish.og 2> >(tee -a "$log_file")
    odgi stats -i "$prefix_seqwish".seqwish.og -m > "$prefix_seqwish".seqwish.og.stats.yaml 2>&1 | tee -a "$log_file"
    odgi stats -i "$prefix_smoothed_output".final.og -m  > "$prefix_smoothed_output".final.og.stats.yaml 2>&1 | tee -a "$log_file"
    if [[ $consensus_spec != false ]]; then
        for consensus_graph in "$prefix_smoothed_output"*.cons*.gfa; do
            odgi build -t $threads -P -g "$consensus_graph" -o "$consensus_graph".og 2> >(tee -a "$log_file")
            odgi stats -i "$consensus_graph".og -m >"$consensus_graph".og.stats.yaml 2>&1 | tee -a "$log_file"
        done
    fi
fi

if [[ $do_viz == true ]]; then
    # big problem: this assumes that there is no "Consensus_" in the input sequences

    # default visualization
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_multiqc.png \
                      -x 1500 -y 500 -a 10 -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi

    # color by nucleotide position in the path
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_pos_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_pos_multiqc.png \
                      -x 1500 -y 500 -a 10 -u -d -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi

    # color by mean depth
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_depth_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_depth_multiqc.png \
                      -x 1500 -y 500 -a 10 -m -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi

    # color by mean inversion rate
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_inv_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_inv_multiqc.png \
                      -x 1500 -y 500 -a 10 -z -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi

    # compressed mode
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_O_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_O_multiqc.png \
                      -x 1500 -y 500 -a 10 -O -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi

    # color by uncalled bases
    if [[ ! -s "$prefix_smoothed_output".final.og.viz_uncalled_multiqc.png || $resume == false ]]; then
      $timer -f "$fmt" odgi viz -i "$prefix_smoothed_output".final.og \
                      -o "$prefix_smoothed_output".final.og.viz_uncalled_multiqc.png \
                      -x 1500 -y 500 -a 10 -N -I "$consensus_prefix" \
                      2> >(tee -a "$log_file")
    fi
fi

if [[ $do_layout == true ]]; then
    # the 2D layout is "smoother" when we chop the graph nodes to a fixed maximum length, but it becomes heavier to compute
    #$timer -f "$fmt" odgi chop -i "$prefix_smoothed_output".final.og -c 100 -o ""$prefix_smoothed_output".final.chop.og \
    #    2> >(tee -a "$log_file")

    if [[ ! -s "$prefix_smoothed_output".final.og.lay || $resume == false ]]; then
      # adding `-N g` to this call can help when rendering large, complex graphs that aren't globally linear
      $timer -f "$fmt" odgi layout -i "$prefix_smoothed_output".final.og \
                         -o "$prefix_smoothed_output".final.og.lay \
                         -T "$prefix_smoothed_output".final.og.lay.tsv \
                         -t $threads --temp-dir "$temp_dir" -P \
                         2> >(tee -a "$log_file")
    fi

    # the graph can be drawn in 2D in different ways, based on the same layout
    if [[ ! -s "$prefix_smoothed_output".final.og.lay.draw.png || $resume == false ]]; then
      # here we draw in default mode
      $timer -f "$fmt" odgi draw -i "$prefix_smoothed_output".final.og \
                       -c "$prefix_smoothed_output".final.og.lay \
                       -p "$prefix_smoothed_output".final.og.lay.draw.png \
                       -H 1000 \
                       2> >(tee -a "$log_file")
    fi

    if [[ ! -s "$prefix_smoothed_output".final.og.lay.draw_multiqc.png || $resume == false ]]; then
      # this attempts to add paths
      $timer -f "$fmt" odgi draw -i "$prefix_smoothed_output".final.og \
                       -c "$prefix_smoothed_output".final.og.lay \
                       -p "$prefix_smoothed_output".final.og.lay.draw_multiqc.png \
                       -C -w 20 \
                       -H 1000 \
                       2> >(tee -a "$log_file")
    fi
fi

multiqc_out_dir=$(dirname "$input_fasta")

multiqc_config="# Report section config for nice titles and descriptions
custom_data:
  odgi_O:
    section_name: ODGI Compressed 1D visualization
    description: This image shows a 1D rendering of the built pangenome graph. The graph nodes are arranged from left to right, forming the pangenome sequence. Summarization of path coverage across all paths. A heatmap color-coding from https://colorbrewer2.org/#type=diverging&scheme=RdBu&n=11 is used. Dark blue means highest coverage. Dark red means lowest coverage. The path names are placed on the left. The black lines under the paths are the links, which represent the graph topology.
  odgi_viz:
    section_name: ODGI 1D visualization
    description: This image shows a 1D rendering of the built pangenome graph. The graph nodes are arranged from left to right, forming the pangenome sequence. The colored bars represent the paths versus the pangenome sequence in a binary matrix. The path names are placed on the left. The black lines under the paths are the links, which represent the graph topology.
  odgi_viz_pos:
    section_name: ODGI 1D visualization by path position
    description: This shows a 1D rendering of the built pangenome graph where the paths are colored according to their nucleotide position. Light grey means a low path position, black is the highest path position.
  odgi_viz_inv:
    section_name: ODGI 1D visualization by path orientation
    description: This image shows a 1D rendering of the built pangenome graph where the paths are colored by orientation. Forward is black, reverse is red.
  odgi_viz_depth:
    section_name: ODGI 1D visualization by node depth
    description: This shows a 1D rendering of the built pangenome graph where the paths are colored according to path depth. Using the Spectra color palette with 4 levels of path depths, white indicates no depth, while grey, red, and yellow indicate depth 1, 2, and greater than or equal to 3, respectively.
  odgi_viz_uncalled:
    section_name: ODGI 1D visualization by uncalled bases
    description: This shows a 1D rendering of the built pangenome graph where the paths are colored according to the coverage of uncalled bases. The lighter the green, the higher the 'N' content of a node is.
  odgi_draw:
    section_name: ODGI 2D drawing
    description: This image shows a 2D rendering of the built pangenome graph.

# Custom search patterns to find the image outputs
sp:
  odgi_O:
    fn: \"*O_multiqc.png\"
  odgi_draw:
    fn: \"*draw_multiqc.png\"
  odgi_viz:
    fn: \"*viz_multiqc.png\"
  odgi_viz_pos:
    fn: \"*viz_pos_multiqc.png\"
  odgi_viz_inv:
    fn: \"*viz_inv_multiqc.png\"
  odgi_viz_depth:
    fn: \"*viz_depth_multiqc.png\"
  odgi_viz_uncalled:
    fn: \"*viz_uncalled_multiqc.png\"    
  testing_name:
    fn: \"*draw.png\"
ignore_images: false

# Make the custom content stuff come after the ODGI module output
module_order:
  - odgi
  - custom_content

# Set the order that the custom content plots should come in
custom_content:
  order:
    - odgi_O
    - odgi_viz
    - odgi_viz_pos
    - odgi_viz_inv
    - odgi_viz_depth
    - odgi_viz_uncalled
    - odgi_draw
fn_clean_exts:
  - \".gfa\""

if [[ $multiqc == true ]]; then
    echo "$multiqc_config" > "$output_dir"/multiqc_config.yaml
    if [[ $output_dir == "" ]]; then
        $timer -f "$fmt" multiqc "$multiqc_out_dir" \
        -s \
        -o "$multiqc_out_dir" \
        -c "$output_dir"/multiqc_config.yaml \
        2> >(tee -a "$log_file")
    else
        $timer -f "$fmt" multiqc "$output_dir" \
        -s \
        -o "$output_dir" \
        -c "$output_dir"/multiqc_config.yaml \
        2> >(tee -a "$log_file")
    fi
fi

if [[ $keep_intermediate_files == true ]]; then
  if [[ "$output_dir" != "$temp_dir" ]]; then
    # `|| true` to avoid `mv` fail if there are missing files to move
    mv -f "$prefix_mappings_paf".mappings.$mapper.paf "$output_dir" 2> /dev/null || true
    mv -f "$prefix_seqwish".seqwish.{gfa,og} "$output_dir" 2> /dev/null || true
    mv -f "$prefix_seqwish".*.prep.gfa "$output_dir" 2> /dev/null || true

    mv -f "$prefix_seqwish".seqwish.gfa.smooth.[0-9].{gfa,og} "$output_dir" 2> /dev/null || true
    mv -f "$prefix_smoothed".fix.gfa "$output_dir" 2> /dev/null || true

    if [[ $reduce_redundancy == true ]]; then
      mv -f "$prefix_smoothed".{gfa,og} "$output_dir" 2> /dev/null || true
    fi

    # Since these temporary files have just been moved to a different folder
    prefix_seqwish="$output_dir"/$(basename "$prefix_seqwish")
    prefix_smoothed="$output_dir"/$(basename "$prefix_smoothed")
  fi
else
  rm -f "$prefix_mappings_paf".mappings.$mapper.paf
  rm -f "$prefix_seqwish".seqwish.{gfa,og}
  rm -f "$prefix_seqwish".seqwish.gfa.smooth.[0-9].{gfa,og}

  if [[ $reduce_redundancy == true ]]; then
    rm -f "$prefix_smoothed".{gfa,og}
    rm -f "$prefix_smoothed".fix.gfa
  fi
fi

if [[ $temp_dir_was_created == true ]]; then
  rm -r "$temp_dir"
fi

if [[ $compress == true ]]; then
    if [[ $input_paf == false ]]; then
        pigz -f -q -p $threads "$prefix_paf"*.paf -v
    fi
    pigz -f -q -p $threads "$prefix_seqwish"*.{gfa,og} -v
    if [[ $write_maf != false ]]; then
      pigz -f -q -p $threads "$prefix_seqwish"*.maf -v
    fi
    if [[ $vcf_spec != false ]]; then
      ls "$prefix_seqwish"*.vcf | while read f; do bgzip "$f" -f -@ $threads; done
    fi
    if [[ $do_layout == true ]]; then
        pigz -f -q -p $threads "$prefix_smoothed_output".final.og.lay.tsv -v
    fi
fi
