Source code for ppanggolin.formats.writeFlat

#!/usr/bin/env python3
#coding:utf-8

#default libraries
import argparse
from multiprocessing import Pool
from collections import Counter, defaultdict
import logging
import pkg_resources
from statistics import median, mean, stdev
import os
from random import randint, shuffle

#local libraries
from ppanggolin.pangenome import Pangenome
from ppanggolin.utils import write_compressed_or_not, mkOutdir, restricted_float
from ppanggolin.formats import checkPangenomeInfo, getGeneSequencesFromFile

#installed libraries
from tqdm import tqdm, trange
from numpy import percentile

#global variable to store the pangenome
pan = None

[docs]def writeJSONheader(json):
    json.write('{"directed": false, "multigraph": false,')
    json.write(' "graph": {')
    json.write(' "organisms": {')
    orgstr = []
    for org in pan.organisms:
        orgstr.append('"'+org.name+'": {')
        contigstr = []
        for contig in org.contigs:
            contigstr.append('"' + contig.name + '": {"is_circular": ' + ('true' if contig.is_circular else 'false')+'}')
        orgstr[-1] += ', '.join(contigstr) + "}"

    json.write(', '.join(orgstr) + "}")
    ##if other things are to be written such as the parameters, write them here
    json.write('},')

[docs]def writeJSONGeneFam(geneFam, json):
        json.write('{'+ f'"id": "{geneFam.name}", "nb_genes": {len(geneFam.genes)}, "partition": "{geneFam.namedPartition}", "subpartition": "{geneFam.partition}"')
        orgDict = {}
        name_counts = Counter()
        product_counts = Counter()
        length_counts = Counter()
        for gene in geneFam.genes:
            name_counts[gene.name] += 1
            product_counts[gene.product] += 1
            length_counts[gene.stop - gene.start] += 1
            try:
                orgDict[gene.organism][gene.contig].append(gene)
            except KeyError:
                try:
                    orgDict[gene.organism][gene.contig] = [gene]
                except KeyError:
                    orgDict[gene.organism] = {gene.contig : [gene]}
        json.write(f', "name": "{name_counts.most_common(1)[0][0]}", "product": "{product_counts.most_common(1)[0][0]}", "length": {length_counts.most_common(1)[0][0]}')
        json.write(', "organisms": {')
        orgstr = []
        for org in orgDict:
            orgstr.append('"' + org.name + '": {')
            contigstr = []
            for contig in orgDict[org]:
                contigstr.append('"' + contig.name + '": {')
                genestr = []
                for gene in orgDict[org][contig]:
                    identifier = gene.ID if gene.local_identifier == "" else gene.local_identifier
                    genestr.append('"' + identifier + '": {' + f'"name": "{gene.name}", "product": "{gene.product}", "is_fragment": {"true" if gene.is_fragment else "false"}, "position": {gene.position}, "strand": "{gene.strand}", "end": {gene.stop}, "start": {gene.start}'+'}')
                contigstr[-1] += ", ".join(genestr) + "}"
            orgstr[-1] += ", ".join(contigstr) + "}"
        json.write(", ".join(orgstr) + "}}")

[docs]def writeJSONnodes(json):
    json.write('"nodes": [')
    famList = list(pan.geneFamilies)
    firstFam = famList[0]
    writeJSONGeneFam(firstFam, json)
    for geneFam in famList[1:]:
        json.write(', ')
        writeJSONGeneFam(geneFam, json)
    json.write(']')

[docs]def writeJSONedge(edge, json):
    json.write("{")
    json.write(f'"weight": {len(edge.genePairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"')
    json.write(', "organisms": {')
    orgstr = []
    for org in edge.getOrgDict():
        orgstr.append('"' + org.name + '": [')
        genepairstr = []
        for genepair in  edge.getOrgDict()[org]:
            genepairstr.append('{"source": "' + genepair[0].ID + '", "target": "' + genepair[1].ID + f'", "length": {genepair[0].start - genepair[1].stop}' + '}')
        orgstr[-1] += ', '.join(genepairstr) + ']'
    json.write(', '.join(orgstr) + "}}")

[docs]def writeJSONedges(json):
    json.write(', "links": [')
    edgelist = list(pan.edges)
    writeJSONedge(edgelist[0], json)
    for edge in edgelist[1:]:
        json.write(", ")
        writeJSONedge(edge, json)
    json.write(']')

[docs]def writeJSON(output, compress):
    logging.getLogger().info("Writing the json file for the pangenome graph...")
    outname = output + "/pangenomeGraph.json"
    with write_compressed_or_not(outname, compress) as json:
        writeJSONheader(json)
        writeJSONnodes(json)
        writeJSONedges(json)
        json.write("}")
    logging.getLogger().info(f"Done writing the json file : '{outname}'")

[docs]def writeGEXFheader(gexf, light):
    if not light:
        index = pan.getIndex()#has been computed already
    gexf.write('<?xml version="1.1" encoding="UTF-8"?>\n<gexf xmlns:viz="http://www.gexf.net/1.2draft/viz" xmlns="http://www.gexf.net/1.2draft" version="1.2">\n')
    gexf.write('  <graph mode="static" defaultedgetype="undirected">\n')
    gexf.write('    <attributes class="node" mode="static">\n')
    gexf.write('      <attribute id="0" title="nb_genes" type="long" />\n')
    gexf.write('      <attribute id="1" title="name" type="string" />\n')
    gexf.write('      <attribute id="2" title="product" type="string" />\n')
    gexf.write('      <attribute id="3" title="type" type="string" />\n')
    gexf.write('      <attribute id="4" title="partition" type="string" />\n')
    gexf.write('      <attribute id="5" title="subpartition" type="string" />\n')
    gexf.write('      <attribute id="6" title="partition_exact" type="string" />\n')
    gexf.write('      <attribute id="7" title="partition_soft" type="string" />\n')
    gexf.write('      <attribute id="8" title="length_avg" type="double" />\n')
    gexf.write('      <attribute id="9" title="length_med" type="long" />\n')
    gexf.write('      <attribute id="10" title="nb_organisms" type="long" />\n')
    if not light:
        for org, orgIndex in index.items():
            gexf.write(f'      <attribute id="{orgIndex + 12}" title="{org.name}" type="string" />\n')

    gexf.write('    </attributes>\n')
    gexf.write('    <attributes class="edge" mode="static">\n')
    gexf.write('      <attribute id="11" title="nb_genes" type="long" />\n')
    if not light:
        for org, orgIndex in index.items():
            gexf.write(f'      <attribute id="{orgIndex + len(index) + 12}" title="{org.name}" type="long" />\n')
    # gexf.write('      <attribute id="12" title="nb_organisms" type="long" />\n')#useless because it's the weight of the edge
    gexf.write('    </attributes>\n')
    gexf.write('    <meta>\n')
    gexf.write(f'      <creator>PPanGGOLiN {pkg_resources.get_distribution("ppanggolin").version}</creator>\n')
    gexf.write('    </meta>\n')

[docs]def writeGEXFnodes(gexf, light, soft_core = 0.95):
    gexf.write('    <nodes>\n')
    colors = {"persistent":'a="0" b="7" g="165" r="247"','shell':'a="0" b="96" g="216" r="0"', 'cloud':'a="0" b="255" g="222" r="121"'}
    if not light:
        index = pan.getIndex()

    for fam in pan.geneFamilies:
        name = Counter()
        product = Counter()
        gtype = Counter()
        l = []
        for gene in fam.genes:
            name[gene.name] +=1
            product[gene.product] += 1
            gtype[gene.type] += 1
            l.append(gene.stop - gene.start)

        gexf.write(f'      <node id="{fam.ID}" label="{fam.name}">\n')
        gexf.write(f'        <viz:color {colors[fam.namedPartition]} />\n')
        gexf.write(f'        <viz:size value="{len(fam.organisms)}" />\n')
        gexf.write(f'        <attvalues>\n')
        gexf.write(f'          <attvalue for="0" value="{len(fam.genes)}" />\n')
        gexf.write(f'          <attvalue for="1" value="{name.most_common(1)[0][0]}" />\n')
        gexf.write(f'          <attvalue for="2" value="{product.most_common(1)[0][0]}" />\n')
        gexf.write(f'          <attvalue for="3" value="{gtype.most_common(1)[0][0]}" />\n')
        gexf.write(f'          <attvalue for="4" value="{fam.namedPartition}" />\n')
        gexf.write(f'          <attvalue for="5" value="{fam.partition}" />\n')
        gexf.write(f'          <attvalue for="6" value="{"exact_accessory" if len(fam.organisms) != len(pan.organisms) else "exact_core"}" />\n')
        gexf.write(f'          <attvalue for="7" value="{"soft_core" if len(fam.organisms) > (len(pan.organisms)*soft_core) else "soft_accessory"}" />\n')
        gexf.write(f'          <attvalue for="8" value="{round(sum(l) / len(l),2)}" />\n')
        gexf.write(f'          <attvalue for="9" value="{ int(median(l))}" />\n')
        gexf.write(f'          <attvalue for="10" value="{len(fam.organisms)}" />\n')
        if not light:
            for org, genes in fam.getOrgDict().items():
                gexf.write(f'          <attvalue for="{index[org]+12}" value="{"|".join([ gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in genes])}" />\n')
        gexf.write(f'        </attvalues>\n')
        gexf.write(f'      </node>\n')
    gexf.write('    </nodes>\n')

[docs]def writeGEXFedges(gexf, light):
    gexf.write('    <edges>\n')
    edgeids = 0
    index = pan.getIndex()

    for edge in pan.edges:
        gexf.write(f'      <edge id="{edgeids}" source="{edge.source.ID}" target="{edge.target.ID}" weight="{len(edge.organisms)}">\n')
        gexf.write(f'        <viz:thickness value="{len(edge.organisms)}" />\n')
        gexf.write('        <attvalues>\n')
        gexf.write(f'          <attribute id="11" value="{len(edge.genePairs)}" />\n')
        if not light:
            for org, genes in edge.getOrgDict().items():
                gexf.write(f'          <attvalue for="{index[org]+len(index)+12}" value="{len(genes)}" />\n')
        gexf.write('        </attvalues>\n')
        gexf.write('      </edge>\n')
        edgeids+=1
    gexf.write('    </edges>\n')

[docs]def writeGEXFend(gexf):
    gexf.write("  </graph>")
    gexf.write("</gexf>")

[docs]def writeGEXF(output, light = True, soft_core = 0.95, compress=False):
    txt = "Writing the gexf file for the pangenome graph..."
    if light:
        txt = "Writing the light gexf file for the pangenome graph..."
    logging.getLogger().info(txt)
    outname = output + "/pangenomeGraph"
    outname += "_light" if light else ""
    outname += ".gexf"
    with write_compressed_or_not(outname,compress) as gexf:
        writeGEXFheader(gexf, light)
        writeGEXFnodes(gexf, light)
        writeGEXFedges(gexf, light)
        writeGEXFend(gexf)
    logging.getLogger().info(f"Done writing the gexf file : '{outname}'")

[docs]def writeMatrix(sep, ext, output, compress=False, geneNames = False):
    logging.getLogger().info(f"Writing the .{ext} file ...")
    outname = output + "/matrix." + ext
    with write_compressed_or_not(outname,compress) as matrix:

        index_org = {}
        default_dat = []
        for index, org in enumerate(pan.organisms):
            default_dat.append('0')
            index_org[org] = index

        matrix.write(sep.join(['"Gene"',#1
                                '"Non-unique Gene name"',#2
                                '"Annotation"',#3
                                '"No. isolates"',#4
                                '"No. sequences"',#5
                                '"Avg sequences per isolate"',#6
                                '"Accessory Fragment"',#7
                                '"Genome Fragment"',#8
                                '"Order within Fragment"',#9
                                '"Accessory Order with Fragment"',#10
                                '"QC"',#11
                                '"Min group size nuc"',#12
                                '"Max group size nuc"',#13
                                '"Avg group size nuc"']#14
                                +['"'+str(org)+'"' for org in pan.organisms])+"\n")#15
        default_genes = ['""'] * len(pan.organisms) if geneNames else ["0"] * len(pan.organisms)
        org_index = pan.getIndex()#should just return things
        for fam in pan.geneFamilies:
            genes = default_genes.copy()
            l = []
            alt = fam.namedPartition if fam.partition != "" else False
            genenames = Counter()
            product = Counter()
            for org, gene_list in fam.getOrgDict().items():
                genes[org_index[org]] = " ".join([ '"' + str(gene) + '"' for gene in gene_list]) if geneNames else str(len(gene_list))
                for gene in gene_list:
                    l.append(gene.stop - gene.start)
                    product[gene.product] +=1
                    genenames[gene.name] += 1

            if fam.partition != "":
                alt = fam.namedPartition
            else:
                alt = str(product.most_common(1)[0][0])

            l = [ gene.stop - gene.start for gene in fam.genes ]
            matrix.write(sep.join(['"'+fam.name+'"',#1
                                    '"'+alt+'"',#2
                                    '"'+ str(product.most_common(1)[0][0])  +'"',#3
                                    '"' + str(len(fam.organisms)) + '"',#4
                                    '"' + str(len(fam.genes)) + '"',#5
                                    '"' + str(round(len(fam.genes)/len(fam.organisms),2)) + '"',#6
                                    '"NA"',#7
                                    '"NA"',#8
                                    '""',#9
                                    '""',#10
                                    '""',#11
                                    '"' + str(min(l)) + '"',#12
                                    '"' + str(max(l)) + '"',#13
                                    '"' + str(round(sum(l)/len(l),2)) + '"']#14
                                    +genes)+"\n")#15
    logging.getLogger().info(f"Done writing the matrix : '{outname}'")

[docs]def writeGenePresenceAbsence(output, compress=False):
    logging.getLogger().info(f"Writing the gene presence absence file ...")
    outname = output + "/gene_presence_absence.Rtab"
    with write_compressed_or_not(outname,compress) as matrix:
        index_org = {}
        default_dat = []
        for index, org in enumerate(pan.organisms):
            default_dat.append('0')
            index_org[org] = index

        matrix.write('\t'.join(['Gene']#14
                                +[str(org) for org in pan.organisms])+"\n")#15
        default_genes =  ["0"] * len(pan.organisms)
        org_index = pan.getIndex()#should just return things
        for fam in pan.geneFamilies:
            genes = default_genes.copy()
            for org in fam.organisms:
                genes[org_index[org]] = "1"

            matrix.write('\t'.join([fam.name]#14
                                    +genes)+"\n")#15
    logging.getLogger().info(f"Done writing the gene presence absence file : '{outname}'")

[docs]def writeStats(output, soft_core, dup_margin, compress=False):
    logging.getLogger().info("Writing pangenome statistics...")
    logging.getLogger().info("Writing statistics on persistent duplication...")
    single_copy_markers = set()#could use bitarrays if speed is needed
    with write_compressed_or_not(output + "/mean_persistent_duplication.tsv", compress) as outfile:
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        outfile.write("\t".join(["persistent_family","duplication_ratio","mean_presence","is_single_copy_marker"]) + "\n")
        for fam in pan.geneFamilies:
            if fam.namedPartition == "persistent":
                mean_pres = len(fam.genes) / len(fam.organisms)
                nb_multi = 0
                for gene_list in fam.getOrgDict().values():
                    if len(gene_list) > 1:
                        nb_multi +=1
                dup_ratio = nb_multi / len(fam.organisms)
                is_SCM = False
                if dup_ratio < dup_margin:
                    is_SCM = True
                    single_copy_markers.add(fam)
                outfile.write("\t".join([fam.name,
                                         str(round(dup_ratio,3)),
                                         str(round(mean_pres,3)),
                                         str(is_SCM)]) + "\n")
    logging.getLogger().info("Done writing stats on persistent duplication")
    logging.getLogger().info("Writing genome per genome statistics (completeness and counts)...")
    soft = set()#could use bitarrays if speed is needed
    core = set()
    for fam in pan.geneFamilies:
        if len(fam.organisms) >= pan.number_of_organisms() * soft_core:
            soft.add(fam)
        if len(fam.organisms) == pan.number_of_organisms():
            core.add(fam)

    with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile:
        outfile.write(f"#soft_core={round(soft_core,3)}\n")
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        outfile.write("\t".join(["organism","nb_families","nb_persistent_families","nb_shell_families","nb_cloud_families","nb_exact_core","nb_soft_core","nb_genes","nb_persistent_genes","nb_shell_genes","nb_cloud_genes","nb_exact_core_genes","nb_soft_core_genes","completeness","nb_single_copy_markers"]) + "\n")

        for org in pan.organisms:
            fams = org.families
            nb_pers = 0
            nb_shell = 0
            nb_cloud = 0
            for fam in fams:
                if fam.namedPartition == "persistent":
                    nb_pers+=1
                elif fam.namedPartition == "shell":
                    nb_shell+=1
                else:
                    nb_cloud+=1

            nb_gene_pers = 0
            nb_gene_shell = 0
            nb_gene_soft = 0
            nb_gene_cloud = 0
            nb_gene_core = 0
            for gene in org.genes:
                if gene.family.namedPartition == "persistent":
                    nb_gene_pers +=1
                elif gene.family.namedPartition == "shell":
                    nb_gene_shell +=1
                else:
                    nb_gene_cloud += 1
                if gene.family in soft:
                    nb_gene_soft+=1
                    if gene.family in core:
                        nb_gene_core+=1
            completeness = "NA"
            if len(single_copy_markers) > 0:
                completeness = round((len(fams & single_copy_markers) / len(single_copy_markers))*100,2)
            outfile.write("\t".join(map(str,[org.name,
                                    len(fams),
                                    nb_pers,
                                    nb_shell,
                                    nb_cloud,
                                    len(core & fams),
                                    len(soft & fams),
                                    org.number_of_genes(),
                                    nb_gene_pers,
                                    nb_gene_shell,
                                    nb_gene_cloud,
                                    nb_gene_core,
                                    nb_gene_soft,
                                    completeness,
                                    len(fams & single_copy_markers)])) + "\n")

    logging.getLogger().info("Done writing genome per genome statistics")

[docs]def writeOrgFile(org, output, compress=False):
    with write_compressed_or_not(output + "/" + org.name + ".tsv",compress) as outfile:
        outfile.write("\t".join(["gene","contig","start","stop","strand","ori","family","nb_copy_in_org","partition","persistent_neighbors","shell_neighbors","cloud_neighbors"]) + "\n")
        for contig in org.contigs:
            for gene in contig.genes:
                nb_pers = 0
                nb_shell = 0
                nb_cloud = 0
                for neighbor in gene.family.neighbors:
                    if neighbor.namedPartition == "persistent":
                        nb_pers+=1
                    elif neighbor.namedPartition == "shell":
                        nb_shell+=1
                    else:
                        nb_cloud+=1
                outfile.write("\t".join(map(str,[ gene.ID if gene.local_identifier == "" else gene.local_identifier,
                                        contig.name,
                                        gene.start,
                                        gene.stop,
                                        gene.strand,
                                        "T" if (gene.name.upper() == "DNAA" or gene.product.upper() == "DNAA") else "F",
                                        gene.family.name,
                                        len(gene.family.getGenesPerOrg(org)),
                                        gene.family.namedPartition,
                                        nb_pers,
                                        nb_shell,
                                        nb_cloud
                                        ])) + "\n")

[docs]def writeProjections(output, compress=False):
    logging.getLogger().info("Writing the projection files...")
    outdir = output+"/projection"
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for org in pan.organisms:
        writeOrgFile(org, outdir, compress)
    logging.getLogger().info("Done writing the projection files")

[docs]def writeParts(output, soft_core, compress=False):
    logging.getLogger().info("Writing the list of gene families for each partitions...")
    if not os.path.exists(output + "/partitions"):
        os.makedirs(output + "/partitions")
    partSets = defaultdict(set)
    #initializing key, value pairs so that files exist even if they are empty
    for neededKey in ["undefined","soft_core","exact_core","exact_accessory","soft_accessory","persistent","shell","cloud"]:
        partSets[neededKey] = set()
    for fam in pan.geneFamilies:
        partSets[fam.namedPartition].add(fam.name)
        if fam.partition.startswith("S"):
            partSets[fam.partition].add(fam.name)
        if len(fam.organisms) >= len(pan.organisms) * soft_core:
            partSets["soft_core"].add(fam.name)
            if len(fam.organisms) == len(pan.organisms):
                partSets["exact_core"].add(fam.name)
            else:
                partSets["exact_accessory"].add(fam.name)
        else:
            partSets["soft_accessory"].add(fam.name)
            partSets["exact_accessory"].add(fam.name)

    for key, val in partSets.items():
        currKeyFile = open(output + "/partitions/" + key + ".txt","w")
        if len(val) > 0:
            currKeyFile.write('\n'.join(val) + "\n")
        currKeyFile.close()
    logging.getLogger().info("Done writing the list of gene families for each partition")

[docs]def writeGeneFamiliesTSV(output, compress=False):
    logging.getLogger().info("Writing the file providing the association between genes and gene families...")
    outname = output + "/gene_families.tsv"
    with write_compressed_or_not(outname,compress) as tsv:
        for fam in pan.geneFamilies:
            for gene in fam.genes:
                tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""])+"\n")
    logging.getLogger().info(f"Done writing the file providing the association between genes and gene families : '{outname}'")

[docs]def writeFastaGenFam(output, compress=False):
    logging.getLogger().info("Writing the representative nucleic sequences of all the gene families...")
    outname = output + "/representative_gene_families.fna"
    with write_compressed_or_not(outname,compress) as fasta:
        getGeneSequencesFromFile(pan,fasta,[fam.name for fam in pan.geneFamilies])
    logging.getLogger().info(f"Done writing the representative nucleic sequences of all the gene families : '{outname}'")

[docs]def writeFastaProtFam(output, compress=False):
    logging.getLogger().info("Writing the representative proteic sequences of all the gene families...")
    outname = output + "/representative_gene_families.faa"
    with write_compressed_or_not(outname,compress) as fasta:
        bar = tqdm(range(pan.number_of_geneFamilies()),unit="prot families")
        for fam in list(pan.geneFamilies):
            fasta.write('>' +fam.name + "\n")
            fasta.write(fam.sequence + "\n")
            bar.update()
        bar.close()
    logging.getLogger().info(f"Done writing the representative proteic sequences of all the gene families : '{outname}'")

[docs]def writeGeneSequencesFromAnnotations(pangenome, fileObj, verbose = False):
    """
        Writes the CDS sequences of the Pangenome object to a tmpFile object
        Loads the sequences from previously computed or loaded annotations
    """
    logging.getLogger().info("Writing all of the CDS sequences...")
    bar =  tqdm(pangenome.genes, unit="gene", disable= not verbose)
    for gene in bar:
        if gene.type == "CDS":
            fileObj.write('>' + gene.ID + "\n")
            fileObj.write(gene.dna + "\n")
    fileObj.flush()
    bar.close()

[docs]def writeGeneSequences(output, compress=False):
    logging.getLogger().info("Writing all the gene nucleic sequences...")
    outname = output + "/all_genes.fna"
    
    with write_compressed_or_not(outname,compress) as fasta:
        if pan.status["geneSequences"] in ["inFile"]:
            getGeneSequencesFromFile(pan,fasta)
        elif pan.status["geneSequences"] in ["Computed","Loaded"]:
            writeGeneSequencesFromAnnotations(pan, fasta)
        else:
            #this should never happen if the pangenome has been properly checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    logging.getLogger().info(f"Done writing all the gene sequences : '{outname}'")

[docs]def writeRegions(output, compress = False):
    fname = output + "/plastic_regions.tsv"
    with write_compressed_or_not(fname, compress) as tab:
        tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n")
        regions = sorted(pan.regions, key = lambda x : (x.organism.name, x.contig.name, x.start))
        for region in regions:
            tab.write('\t'.join(map(str,[region.name, region.organism, region.contig, region.start, region.stop, len(region.genes), region.isContigBorder, region.isWholeContig]))+"\n")

[docs]def summarize_spots(spots, output, compress):

    def r_and_s(value):
        """ rounds to dp figures and returns a str of the provided value"""
        if isinstance(value, float):
            return str(round(value,3))
        else:
            return str(value)

    with write_compressed_or_not(output + "/summarize_spots.tsv", compress) as fout:
        fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\tstdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n")
        for spot in sorted(spots, key=lambda x : len(x.regions), reverse=True):
            tot_fams = set()
            rgp_list = list(spot.regions)
            len_uniq_content = len(spot.getUniqContent())
            size_list = []
            for rgp in spot.regions:
                tot_fams |= rgp.families
                size_list.append(len(rgp.genes))
            mean_size = mean(size_list)
            stdev_size = stdev(size_list) if len(size_list) > 1 else 0
            max_size = max(size_list)
            min_size = min(size_list)
            fout.write("\t".join(map(r_and_s,[f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, mean_size,stdev_size,max_size, min_size])) + "\n")
    logging.getLogger().info(f"Done writing spots in : '{output + '/summarize_spots.tsv'}'")

[docs]def spot2rgp(spots, output, compress):
    with write_compressed_or_not(output + "/spots.tsv", compress) as fout:
        fout.write("spot_id\trgp_id\n")
        n_spot = 0
        for spot in spots:
            for rgp in spot.regions:
                fout.write(f"spot_{spot.ID}\t{rgp.name}\n")
            n_spot+=1

[docs]def writeSpots(output, compress):
    if len(pan.spots) > 0:
        spot2rgp(pan.spots, output, compress)
        summarize_spots(pan.spots, output, compress)

[docs]def writeBorders(output, dup_margin, compress):
    multigenics = pan.get_multigenics(dup_margin=dup_margin)
    all_fams = set()
    with write_compressed_or_not(output+"/spot_borders.tsv",compress) as fout:
        fout.write("spot_id\tnumber\tborder1\tborder2\n")
        for spot in sorted(pan.spots, key= lambda x: len(x.regions), reverse=True):
            curr_borders=spot.borders(pan.parameters["spots"]["set_size"], multigenics)
            for c, border in curr_borders:
                famstring1 = ",".join([ fam.name for fam in border[0] ])
                famstring2 = ",".join([ fam.name for fam in border[1]])
                all_fams |= set(border[0])
                all_fams |= set(border[1])
                fout.write(f"{spot.ID}\t{c}\t{famstring1}\t{famstring2}\n")

    with write_compressed_or_not(output + "/border_protein_genes.fasta",compress) as fout:
        for fam in all_fams:
            fout.write(f">{fam.name}\n")
            fout.write(f"{fam.sequence}\n")

[docs]def writeFlatFiles(pangenome, output, cpu = 1, soft_core = 0.95, dup_margin = 0.05, csv=False, genePA = False, gexf = False, light_gexf = False, projection = False, stats = False, json = False, partitions=False,regions = False, families_tsv = False, all_genes = False, all_prot_families = False, all_gene_families = False, spots = False, borders=False, compress = False):
    
    if not any(x for x in [csv, genePA, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders, families_tsv, all_genes, all_prot_families, all_gene_families]):
        raise Exception("You did not indicate what file you wanted to write.")

    global pan
    pan = pangenome
    processes = []
    needAnnotations = False
    needFamilies = False
    needGraph = False
    needPartitions = False
    needSpots = False
    needRegions = False

    if csv or genePA or gexf or light_gexf or projection or stats or json or partitions or regions or spots or families_tsv or borders:
        needAnnotations = True 
    if csv or genePA or gexf or light_gexf or projection or stats or json or partitions or regions or spots or families_tsv or all_prot_families or all_gene_families or borders:
        needFamilies = True
    if projection or stats or partitions or regions or spots or borders:
        needPartitions = True
    if gexf or light_gexf or json:
        needGraph = True
    if regions or spots or borders:
        needRegions = True
    if spots or borders:
        needSpots = True

    #need to deal with sequence-related flags outside of checkPangenomeInfo since 
    ex_geneSequences = Exception("The provided pangenome has no gene sequences. This is not compatible with any of the following options : --all_genes, --all_gene_families")
    ex_geneFamilySequences = Exception("The provided pangenome has no gene families. This is not compatible with any of the following options : --all_prot_families, all_gene_families")
    if not pan.status["geneSequences"] in ["inFile"] and (all_genes or all_gene_families):
        raise ex_geneSequences
    if not pan.status["geneFamilySequences"] in ["Loaded","Computed","inFile"] and (all_prot_families):
        raise ex_geneFamilySequences

    checkPangenomeInfo(pan, needAnnotations=needAnnotations, needFamilies=needFamilies, needGraph=needGraph, needPartitions= needPartitions, needRGP = needRegions, needSpots = needSpots)
    pan.getIndex()#make the index because it will be used most likely
    with Pool(processes = cpu) as p:
        if csv:
            processes.append(p.apply_async(func = writeMatrix, args = (',', "csv", output, compress, True)))
        if genePA:
            processes.append(p.apply_async(func = writeGenePresenceAbsence, args = (output, compress)))
        if gexf:
            processes.append(p.apply_async(func = writeGEXF, args = (output, False, soft_core, compress)))
        if light_gexf:
            processes.append(p.apply_async(func = writeGEXF, args = (output, True, soft_core, compress)))
        if projection:
            processes.append(p.apply_async(func = writeProjections, args = (output, compress)))
        if stats:
            processes.append(p.apply_async(func = writeStats, args = (output, soft_core, dup_margin, compress)))
        if json:
            processes.append(p.apply_async(func = writeJSON, args = (output, compress)))
        if partitions:
            processes.append(p.apply_async(func = writeParts, args = (output, soft_core, compress)))
        if families_tsv:
            processes.append(p.apply_async(func = writeGeneFamiliesTSV, args = (output, compress)))
        if all_genes:
            processes.append(p.apply_async(func = writeGeneSequences, args = (output, compress)))
        if all_prot_families:
            processes.append(p.apply_async(func = writeFastaProtFam, args = (output, compress)))
        if all_gene_families:
            processes.append(p.apply_async(func = writeFastaGenFam, args = (output, compress)))
        if regions:
            processes.append(p.apply_async(func = writeRegions, args = (output, compress)))
        if spots:
            processes.append(p.apply_async(func = writeSpots, args=(output, compress)))
        if borders:
            processes.append(p.apply_async(func=writeBorders, args=(output, dup_margin, compress)))

        for process in processes:
            process.get()#get all the results

[docs]def launch(args):
    mkOutdir(args.output, args.force)
    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)
    writeFlatFiles(pangenome, args.output,cpu= args.cpu, soft_core=args.soft_core,dup_margin= args.dup_margin,csv= args.csv,genePA= args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, all_genes=args.all_genes, all_prot_families=args.all_prot_families, all_gene_families= args.all_gene_families,spots= args.spots, borders=args.borders, compress=args.compress)

[docs]def writeFlatSubparser(subparser):
    parser = subparser.add_parser("write", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    required = parser.add_argument_group(title = "Required arguments", description = "One of the following arguments is required :")
    required.add_argument('-p','--pangenome',  required=True, type=str, help="The pangenome .h5 file")
    required.add_argument('-o','--output', required=True, type=str, help="Output directory where the file(s) will be written")
    optional = parser.add_argument_group(title = "Optional arguments")
    optional.add_argument("--soft_core",required=False, type=restricted_float, default = 0.95, help = "Soft core threshold to use")
    optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, help = "minimum ratio of organisms in which the family must have multiple genes for it to be considered 'duplicated'")
    optional.add_argument("--gexf",required = False, action = "store_true", help = "write a gexf file with all the annotations and all the genes of each gene family")
    optional.add_argument("--light_gexf",required = False, action="store_true",help = "write a gexf file with the gene families and basic informations about them")
    optional.add_argument("--csv", required=False, action = "store_true",help = "csv file format as used by Roary, among others. The alternative gene ID will be the partition, if there is one")
    optional.add_argument("--Rtab", required=False, action = "store_true",help = "tabular file for the gene binary presence absence matrix")
    optional.add_argument("--projection", required=False, action = "store_true",help = "a csv file for each organism providing informations on the projection of the graph on the organism")
    optional.add_argument("--stats",required=False, action = "store_true",help = "tsv files with some statistics for each organism and for each gene family")
    optional.add_argument("--partitions", required=False, action = "store_true", help = "list of families belonging to each partition, with one file per partitions and one family per line")
    optional.add_argument("--compress",required=False, action="store_true",help="Compress the files in .gz")
    optional.add_argument("--json", required=False, action = "store_true", help = "Writes the graph in a json file format")
    optional.add_argument("--regions", required=False, action = "store_true", help = "Write the RGP in a tab format, one file per genome")
    optional.add_argument("--spots", required=False, action = "store_true", help = "Write spot summary and a list of all rgp in each spot")
    optional.add_argument("--borders", required=False, action = "store_true", help = "List all borders of each spot")
    optional.add_argument("--families_tsv", required=False, action = "store_true", help = "Write a tsv file providing the association between genes and gene families")
    optional.add_argument("--all_genes", required=False, action = "store_true", help = "Write all nucleotic CDS sequences")
    optional.add_argument("--all_prot_families", required=False, action = "store_true", help = "Write Write representative proteic sequences of all the gene families")
    optional.add_argument("--all_gene_families", required=False, action = "store_true", help = "Write representative nucleic sequences of all the gene families")
    return parser