Source code for ppanggolin.main

#!/usr/bin/env python3
#coding:utf-8

#default libraries
import sys
# Refuse to go any further on interpreters older than Python 3.6. This check sits
# right after `import sys`, before the remaining imports of this module.
if sys.version_info < (3, 6):#minimum is python3.6
    raise AssertionError("Minimum python version to run PPanGGOLiN is 3.6. Your current python version is " + ".".join(map(str,sys.version_info)))
import argparse
import logging
import resource
import pkg_resources
import tempfile
import os
from multiprocessing import Process 
import time

#local modules
import ppanggolin.pangenome
import ppanggolin.nem.partition
import ppanggolin.nem.rarefaction
import ppanggolin.graph
import ppanggolin.annotate
import ppanggolin.cluster
import ppanggolin.workflow.workflow
import ppanggolin.workflow.panRGP
import ppanggolin.figures
import ppanggolin.formats
import ppanggolin.info
import ppanggolin.align
import ppanggolin.RGP

def checkTsvSanity(tsv):
    """
        Checks that a tab-separated list of genomes can be used as input.

        Every line is split on tabulations and each field is stripped of its
        surrounding whitespace. The first field is read as the genome name and
        the second field as the path of the file associated with that genome.

        :param tsv: path to the tab-separated file to check
        :raises Exception: as soon as a line holds no tabulation or a genome name
            contains a space; after the whole file was read, if some listed files
            do not exist or if some genome names are used more than once
    """
    nameSet = set()
    duplicatedNames = set()
    nonExistingFiles = set()
    # the context manager closes the file even when one of the checks below raises
    with open(tsv, "r") as f:
        for line in f:
            elements = [el.strip() for el in line.split("\t")]
            if len(elements) <= 1:
                raise Exception(f"No tabulation separator found in given file: {tsv}")
            name, path = elements[0], elements[1]
            if " " in name:
                raise Exception(f"Your genome names contain spaces (The first encountered genome name that had this string : '{name}'). To ensure compatibility with all of the dependencies of PPanGGOLiN this is not allowed. Please remove spaces from your genome names.")
            if name in nameSet:
                duplicatedNames.add(name)
            else:
                nameSet.add(name)
            if not os.path.exists(path):
                nonExistingFiles.add(path)
    # missing files are reported before duplicated names
    if len(nonExistingFiles) != 0:
        raise Exception(f"Some of the given files do not exist. The non-existing files are the following : '{' '.join(nonExistingFiles)}'")
    if len(duplicatedNames) != 0:
        raise Exception(f"Some of your genomes have identical names. The duplicated names are the following : '{' '.join(duplicatedNames)}'")

def checkInputFiles(anno=None, pangenome=None, fasta=None):
    """
        Checks if the provided input files exist and are of the proper format

        Arguments left to None are skipped. The files are checked in the order
        pangenome, anno, fasta.

        :param anno: path to a tab-separated list of annotation files, also checked with checkTsvSanity
        :param pangenome: path to a pangenome file, only checked for existence
        :param fasta: path to a tab-separated list of fasta files, also checked with checkTsvSanity
        :raises FileNotFoundError: if one of the given paths does not exist
        :raises Exception: if checkTsvSanity rejects the anno or fasta list
    """
    # (path, whether the file is a tsv list whose content must be checked as well)
    inputs = ((pangenome, False), (anno, True), (fasta, True))
    for path, isTsvList in inputs:
        if path is None:
            continue
        if not os.path.exists(path):
            raise FileNotFoundError(f"No such file or directory: '{path}'")
        if isTsvList:
            checkTsvSanity(path)

def checkLog(name):
    """
        Converts a log destination name into a writable stream.

        It is used as the argparse 'type' of the --log option.

        :param name: "stdout", "stderr", or the path of a file to write to
        :return: sys.stdout, sys.stderr, or the file opened in "w" mode. Any
            previous content of that file is discarded, and closing it is left
            to the caller.
    """
    if name == "stdout":
        return sys.stdout
    if name == "stderr":
        return sys.stderr
    return open(name, "w")

def cmdLine():
    """
        Builds the ppanggolin command line parser and parses sys.argv with it.

        Every subcommand except 'info' receives the same set of common options
        (--tmpdir, --verbose, --log, --cpu and --force).

        Called with no argument at all, the general help is printed and the
        program exits with status 0. Called with nothing but the name of a
        subcommand that takes the common options, the help of that subcommand is
        printed and the program exits with status 1.

        :return: the argparse namespace of the parsed arguments
        :raises Exception: if the 'annotate' subcommand is given neither fasta nor anno
    """
    #need to manually write the description so that it's displayed into groups of subcommands ....
    desc = (
        "\n"
        "All of the following subcommands have their own set of options. To see them for a given subcommand, use it with -h or --help, as such:\n"
        "  ppanggolin <subcommand> -h\n"
        "\n"
        "  Basic:\n"
        "    workflow      Easy workflow to run a pangenome analysis in one go\n"
        "    panrgp        Easy workflow to run a pangenome analysis with genomic islands and spots of insertion detection\n"
        "  \n"
        "  Expert:\n"
        "    annotate      Annotate genomes\n"
        "    cluster       Cluster proteins in protein families\n"
        "    graph         Create the pangenome graph\n"
        "    partition     Partition the pangenome graph\n"
        "    rarefaction   Compute the rarefaction curve of the pangenome\n"
        "  \n"
        "  Output:\n"
        "    draw          Draw figures representing the pangenome through different aspects\n"
        "    write         Writes 'flat' files representing the pangenome that can be used with other softwares\n"
        "    info          Prints information about a given pangenome graph file\n"
        "  \n"
        "  Regions of genomic Plasticity:\n"
        "    align         Aligns a genome or a set of proteins to the pangenome gene families representatives and predict informations from it\n"
        "    rgp           Predicts Regions of Genomic Plasticity in the genomes of your pangenome\n"
        "    spot          Predicts spots in your pangenome\n"
    )

    parser = argparse.ArgumentParser(description = "Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-v','--version', action='version', version='%(prog)s ' + pkg_resources.get_distribution("ppanggolin").version)
    subparsers = parser.add_subparsers( metavar = "", dest="subcommand", title="subcommands", description = desc)
    subparsers.required = True#because python3 sent subcommands to hell apparently

    #subparsers that receive the common options below
    subs = [
        ppanggolin.annotate.syntaSubparser(subparsers),
        ppanggolin.cluster.clusterSubparser(subparsers),
        ppanggolin.graph.graphSubparser(subparsers),
        ppanggolin.nem.partition.partitionSubparser(subparsers),
        ppanggolin.nem.rarefaction.rarefactionSubparser(subparsers),
        ppanggolin.workflow.workflow.workflowSubparser(subparsers),
        ppanggolin.workflow.panRGP.panRGPSubparser(subparsers),
        ppanggolin.figures.figureSubparser(subparsers),
        ppanggolin.formats.writeFlat.writeFlatSubparser(subparsers),
        ppanggolin.align.alignSubparser(subparsers),
        ppanggolin.RGP.genomicIsland.rgpSubparser(subparsers),
        ppanggolin.RGP.spot.spotSubparser(subparsers),
    ]
    ppanggolin.info.infoSubparser(subparsers)#not adding to subs because the 'common' options are not needed for this.

    for sub in subs:#add options common to all subcommands
        #get the 'optional arguments' action group, and put it back at the end once
        #it is filled so that the common options are displayed last in the help.
        common = sub._action_groups.pop(1)
        common.title = "Common arguments"
        common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), help = "directory for storing temporary files")
        common.add_argument("--verbose",required=False, type=int,default=1,choices=[0,1,2], help = "Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)")
        #argparse passes string defaults through 'type', so the default ends up being sys.stdout
        common.add_argument("--log", required=False, type=checkLog, default="stdout", help = "log output file")
        common.add_argument("-c","--cpu",required = False, default = 1,type=int, help = "Number of available cpus")
        common.add_argument('-f', '--force', action="store_true", help="Force writing in output directory and in pangenome output file.")
        sub._action_groups.append(common)
        #the subcommand was given alone: show its help rather than an argparse error
        if len(sys.argv) == 2 and sub.prog.split()[1] == sys.argv[1]:
            sub.print_help()
            #sys.exit rather than the 'exit' builtin, which only exists when the site module is loaded
            sys.exit(1)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()
    if args.subcommand == "annotate":
        if args.fasta is None and args.anno is None:
            # NOTE(review): the message names a --gff option while the attribute tested is 'anno' - confirm the actual option name in ppanggolin.annotate
            raise Exception( "You must provide at least a file with the --fasta option to annotate from sequences, or a file with the --gff option to load annotations from.")
    return args

def main():
    """
        Entry point of the ppanggolin command.

        Parses the command line, checks the input files that were given, sets
        up logging when the subcommand has a 'verbose' option, then hands the
        parsed arguments to the launch function of the chosen subcommand.
    """
    args = cmdLine()

    #each kind of input is only checked when the subcommand defines the matching option
    for option in ("pangenome", "fasta", "anno"):
        if hasattr(args, option):
            checkInputFiles(**{option: getattr(args, option)})

    if hasattr(args, "verbose"):
        levels = {
            2: logging.DEBUG,#info, debug, warnings and errors
            1: logging.INFO,#info, warnings and errors
            0: logging.WARNING,#only warnings and errors
        }
        logging.basicConfig(stream=args.log, level = levels[args.verbose], format = '%(asctime)s %(filename)s:l%(lineno)d %(levelname)s\t%(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        logging.getLogger().info("Command: " + " ".join(sys.argv))
        logging.getLogger().info("PPanGGOLiN version: "+pkg_resources.get_distribution("ppanggolin").version)

    #kept as a chain so that only the module of the chosen subcommand is looked up
    if args.subcommand == "annotate":
        ppanggolin.annotate.launch(args)
    elif args.subcommand == "cluster":
        ppanggolin.cluster.launch(args)
    elif args.subcommand == "graph":
        ppanggolin.graph.launch(args)
    elif args.subcommand == "partition":
        ppanggolin.nem.partition.launch(args)
    elif args.subcommand == "workflow":
        ppanggolin.workflow.workflow.launch(args)
    elif args.subcommand == "rarefaction":
        ppanggolin.nem.rarefaction.launch(args)
    elif args.subcommand == "draw":
        ppanggolin.figures.launch(args)
    elif args.subcommand == "write":
        ppanggolin.formats.launch(args)
    elif args.subcommand == "info":
        ppanggolin.info.launch(args)
    elif args.subcommand == "align":
        ppanggolin.align.launch(args)
    elif args.subcommand == "rgp":
        ppanggolin.RGP.genomicIsland.launch(args)
    elif args.subcommand == "spot":
        ppanggolin.RGP.spot.launch(args)
    elif args.subcommand == "panrgp":
        ppanggolin.workflow.panRGP.launch(args)

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()