Source code for macsypy.serialization

#########################################################################
# MacSyFinder - Detection of macromolecular systems in protein dataset  #
#               using systems modelling and similarity search.          #
# Authors: Sophie Abby, Bertrand Neron                                  #
# Copyright (c) 2014-2023  Institut Pasteur (Paris) and CNRS.           #
# See the COPYRIGHT file for details                                    #
#                                                                       #
# This file is part of MacSyFinder package.                             #
#                                                                       #
# MacSyFinder is free software: you can redistribute it and/or modify   #
# it under the terms of the GNU General Public License as published by  #
# the Free Software Foundation, either version 3 of the License, or     #
# (at your option) any later version.                                   #
#                                                                       #
# MacSyFinder is distributed in the hope that it will be useful,        #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
# GNU General Public License for more details .                         #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with MacSyFinder (COPYING).                                     #
# If not, see <https://www.gnu.org/licenses/>.                          #
#########################################################################

"""
This module focuses on the ways to serialize the different macsyfinder modules
"""

import abc
from string import Template

from macsypy.gene import GeneStatus


class SystemSerializer(metaclass=abc.ABCMeta):
    """
    Abstract base of the serializers: declares the ``serialize`` entry point
    that every concrete system serializer has to implement.
    """

    @abc.abstractmethod
    def serialize(self, system, hit_system_tracker):
        """
        Produce a serialized representation of *system*.

        :param system: the system to serialize
        :param hit_system_tracker: the tracker indexed by hit, forwarded to the concrete serializers
        :return: nothing at this level, concrete classes return the serialization
        """
        return None
class TxtSystemSerializer(SystemSerializer):
    """
    Serialize a System as a text report meant for human readers
    """

    def serialize(self, system, hit_system_tracker):
        """
        :param system: the system to serialize
        :param hit_system_tracker: object indexed by hit (``hit_system_tracker[h.hit]``)
                                   which gives the systems using this hit
        :return: a string representation of system readable by human.
                 For each hit, the ids of the systems built from another model
                 and using the same hit are listed between square brackets.
        :rtype: str
        """
        cluster_reprs = []
        for cluster in system.clusters:
            members = ", ".join([str((v_h.id, v_h.gene.name, v_h.position)) for v_h in cluster.hits])
            cluster_reprs.append(f"[{members}]")
        clst = ", ".join(cluster_reprs)

        # NOTE(review): the line breaks of this report header are reconstructed
        # (one field per line) -- confirm against the repository copy.
        parts = [
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"clusters = {clst}\n"
            f"occ = {system.occurrence()}\n"
            f"wholeness = {system.wholeness:.3f}\n"
            f"loci nb = {system.loci_nb}\n"
            f"score = {system.score:.3f}\n"
        ]

        gene_categories = (("mandatory", system.mandatory_occ),
                           ("accessory", system.accessory_occ),
                           ("neutral", system.neutral_occ))
        for title, genes in gene_categories:
            parts.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                parts.append(f"\t- {g_name}: {len(gene_hits)} ")
                labels = []
                for h in gene_hits:
                    # only the systems coming from an other model are reported
                    other_systems = sorted(other.id for other in hit_system_tracker[h.hit]
                                           if other.model.fqn != system.model.fqn)
                    labels.append(f"{h.gene.name} [{', '.join(other_systems)}]" if other_systems
                                  else f"{h.gene.name}")
                parts.append(f'({", ".join(labels)})\n')
        return "".join(parts)
class TsvSystemSerializer(SystemSerializer):
    """
    Serialize a System in tsv format: one line per hit
    """

    # column titles, tab separated, in the same order as the template fields
    header = "\t".join((
        "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn",
        "sys_id", "sys_loci", "locus_num", "sys_wholeness", "sys_score", "sys_occ",
        "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score",
        "hit_profile_cov", "hit_seq_cov",
        "hit_begin_match", "hit_end_match", "counterpart", "used_in",
    ))

    # one line of the tsv, filled by serialize for each hit
    template = Template("\t".join((
        "$sys_replicon_name", "$mh_id", "$mh_gene_name", "$mh_position", "$sys_model_fqn",
        "$sys_id", "$sys_loci", "$locus_num", "$sys_wholeness", "$sys_score",
        "$sys_occurrence", "$mh_gene_role", "$mh_status", "$mh_seq_length", "$mh_i_eval",
        "$mh_score", "$mh_profile_coverage", "$mh_sequence_coverage", "$mh_begin_match",
        "$mh_end_match", "$mh_counterpart", "$used_in_systems",
    )) + "\n")

    def serialize(self, system, hit_system_tracker):
        r"""
        :param :class:`macsypy.system.System` system: The system to serialize.
        :param hit_system_tracker: The hit_system_tracker which allow to know for each hit
                                   in which system it is implied.
        :type hit_system_tracker: :class:`macsypy.system.HitSystemTracker` object
        :return: a serialisation of this system in tabulated separated value format.
                 The clusters are paired with ``system.loci_num``, inside a cluster
                 the hits are sorted by position.
                 Each line represent a hit and have the following structure:

            .. code-block:: text

                replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_loci\tlocus_num\tsys_wholeness\tsys_score
                \tsys_occ\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov
                \thit_seq_cov\thit_begin_match\thit_end_match\tcounterpart\tused_in

        :rtype: str
        """
        rows = []
        for locus_num, cluster in zip(system.loci_num, system.clusters):
            ordered_hits = sorted(cluster.hits, key=lambda mh: mh.position)
            for mh in ordered_hits:
                # ids of the systems from an other model which use this hit too
                foreign_ids = sorted(other.id for other in hit_system_tracker[mh.hit]
                                     if other.model.fqn != system.model.fqn)
                fields = dict(
                    sys_replicon_name=system.replicon_name,
                    mh_id=mh.id,
                    mh_gene_name=mh.gene.name,
                    mh_position=mh.position,
                    sys_model_fqn=system.model.fqn,
                    sys_id=system.id,
                    sys_loci=system.loci_nb,
                    locus_num=locus_num,
                    sys_wholeness=f"{system.wholeness:.3f}",
                    sys_score=f"{system.score:.3f}",
                    sys_occurrence=system.occurrence(),
                    mh_gene_role=mh.gene_ref.alternate_of().name,
                    mh_status=mh.status,
                    mh_seq_length=mh.seq_length,
                    mh_i_eval=mh.i_eval,
                    mh_score=f"{mh.score:.3f}",
                    mh_profile_coverage=f"{mh.profile_coverage:.3f}",
                    mh_sequence_coverage=f"{mh.sequence_coverage:.3f}",
                    mh_begin_match=mh.begin_match,
                    mh_end_match=mh.end_match,
                    mh_counterpart=','.join([h.id for h in mh.counterpart]),
                    used_in_systems=','.join(foreign_ids),
                )
                rows.append(self.template.substitute(fields))
        return "".join(rows)
class TsvSolutionSerializer:
    """
    Handle Solution (list of Systems) serialization in tsv format
    """

    header = 'sol_id\t' + TsvSystemSerializer.header

    # same line as TsvSystemSerializer.template preceded by an escaped '$sol_id' field
    template = Template(f"$$sol_id\t{TsvSystemSerializer.template.template}")

    def serialize(self, solution, sol_id, hit_system_tracker):
        """
        :param solution: the solution to serialize
        :type solution: list of :class:`macsypy.system.System` object
        :param int sol_id: the solution identifier
        :param hit_system_tracker: the tracker forwarded to the system serializer
        :type hit_system_tracker: :class:`macsypy.system.HitSystemTracker` object
        :return: a serialisation of this solution (a list of systems) in tabulated separated value format
                 each line represent a hit and have the same structure as system serialization
                 :meth:`macsypy.serialization.TsvSystemSerializer.serialize`
                 but with an extra column sol_id which is a technical id to identified the different solutions.
                 The lines of each system are followed by an empty line.
        :rtype: str
        """
        # The solution id is written in the line template *before* the hits are rendered.
        # Rendering first and parsing the result as a Template (as done previously) made
        # any '$' coming from the data (hit id, replicon name, ...) interpreted as a
        # placeholder: ValueError / KeyError or a silently altered value.
        # '$' is the Template escape character, so it is doubled to be emitted verbatim.
        sol_field = str(sol_id).replace("$", "$$")
        sys_ser = TsvSystemSerializer()
        sys_ser.template = Template(self.template.template.replace("$$sol_id", sol_field, 1))

        return "".join(f"{sys_ser.serialize(system, hit_system_tracker)}\n" for system in solution)
class TxtLikelySystemSerializer(SystemSerializer):
    """
    Serialize a likely System as a text report meant for human readers
    """

    def serialize(self, system, hit_system_tracker):
        """
        :param :class:`macsypy.system.LikelySystem` system: The likely system to serialize.
                                                            Use only for unordered db-type
        :param hit_system_tracker: The hit_system_tracker which allow to know for each hit
                                   in which system it is implied.
        :type hit_system_tracker: :class:`macsypy.system.HitSystemTracker` object
        :return: a string representation of system readable by human.
                 A warning line is emitted when the system has forbidden hits.
        :rtype: str
        """
        hits_repr = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        warning = ("WARNING there quorum is reached but there is also some forbidden genes.\n"
                   if system.forbidden_hits else '\n')

        # NOTE(review): the line breaks of this report header are reconstructed
        # (one field per line) -- confirm against the repository copy.
        parts = [
            f"This replicon contains genetic materials needed for system {system.model.fqn}\n"
            f"{warning}\n"
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"hits = [{hits_repr}]\n"
            f"wholeness = {system.wholeness:.3f}\n"
        ]

        gene_categories = (("mandatory", system.mandatory_occ),
                           ("accessory", system.accessory_occ),
                           ("neutral", system.neutral_occ),
                           ("forbidden", system.forbidden_occ))
        for title, genes in gene_categories:
            parts.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                parts.append(f"\t- {g_name}: {len(gene_hits)} ")
                labels = []
                for h in gene_hits:
                    # only the systems coming from an other model are reported
                    other_systems = sorted(other.id for other in hit_system_tracker[h.hit]
                                           if other.model.fqn != system.model.fqn)
                    labels.append(f"{h.gene.name} [{', '.join(other_systems)}]" if other_systems
                                  else f"{h.gene.name}")
                parts.append(f'({", ".join(labels)})\n')

        parts.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(parts)
class TsvLikelySystemSerializer(SystemSerializer):
    """
    Handle potential System from unordered replicon serialization in tsv format
    """

    header = "replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness" \
             "\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov\thit_seq_cov\t" \
             "hit_begin_match\thit_end_match\tused_in"

    template = Template("$sys_replicon_name\t$mh_id\t$mh_gene_name\t$mh_position\t$sys_model_fqn\t"
                        "$sys_id\t$sys_wholeness\t"
                        "$mh_gene_role\t$mh_status\t$mh_seq_length\t$mh_i_eval\t"
                        "$mh_score\t$mh_profile_coverage\t$mh_sequence_coverage\t$mh_begin_match"
                        "\t$mh_end_match\t$used_in_systems\n")

    def serialize(self, system, hit_system_tracker):
        r"""
        :param :class:`macsypy.system.LikelySystem` system: The likely system to serialize.
                                                            Use only for unordered db-type
        :param hit_system_tracker: The hit_system_tracker which allow to know for each hit
                                   in which system it is implied.
        :type hit_system_tracker: :class:`macsypy.system.HitSystemTracker` object
        :return: a serialisation of this system in tabulated separated value format.
                 The hits are grouped by status, following the order of the ``GeneStatus`` members,
                 and sorted by gene name inside each group.
                 A status for which the system has no ``<status>_hits`` attribute is skipped.
                 Each line represent a hit and have the following structure:

            .. code-block:: text

                replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness
                \thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov
                \thit_seq_cov\thit_begin_match\thit_end_match\tused_in

        :rtype: str
        """
        rows = []
        for status in (name.lower() for name in GeneStatus.__members__):
            # Only the attribute lookup is guarded: the sort used to be inside the try
            # too, so an AttributeError raised by the sort key (a hit without gene/name)
            # silently discarded all the hits of this status.
            try:
                status_hits = getattr(system, f"{status}_hits")
            except AttributeError:
                # this kind of system does not expose hits for this status
                continue

            for mh in sorted(status_hits, key=lambda mh: mh.gene.name):
                # ids of the systems from an other model which use this hit too
                used_in_systems = sorted(other.id for other in hit_system_tracker[mh.hit]
                                         if other.model.fqn != system.model.fqn)
                rows.append(
                    self.template.substitute(
                        sys_replicon_name=system.replicon_name,
                        mh_id=mh.id,
                        mh_gene_name=mh.gene.name,
                        mh_position=mh.position,
                        sys_model_fqn=system.model.fqn,
                        sys_id=system.id,
                        sys_wholeness=f"{system.wholeness:.3f}",
                        mh_gene_role=mh.gene_ref.alternate_of().name,
                        mh_status=mh.status,
                        mh_seq_length=mh.seq_length,
                        mh_i_eval=mh.i_eval,
                        mh_score=f"{mh.score:.3f}",
                        mh_profile_coverage=f"{mh.profile_coverage:.3f}",
                        mh_sequence_coverage=f"{mh.sequence_coverage:.3f}",
                        mh_begin_match=mh.begin_match,
                        mh_end_match=mh.end_match,
                        used_in_systems=','.join(used_in_systems)
                    )
                )
        return "".join(rows)
class TxtUnikelySystemSerializer(SystemSerializer):
    """
    Serialize an unlikely System as a text report meant for human readers
    """

    def serialize(self, system, hit_system_tracker=None):
        """
        :param system: The unlikely system to serialize. (used only if db-type is "unordered_replicon")
        :type system: :class:`macsypy.system.UnlikelySystem` object
        :param hit_system_tracker: not used by this serializer. It is accepted (and optional)
                                   only to stay call-compatible with
                                   :meth:`SystemSerializer.serialize`, so this class can be used
                                   where any other system serializer is expected.
        :return: a string representation of system readable by human
        :rtype: str
        """
        hits_repr = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        reasons = '\n'.join(system.reasons)

        # NOTE(review): the line breaks of this report header are reconstructed
        # (one field per line, an empty line after the reasons) -- confirm against
        # the repository copy.
        parts = [
            f"This replicon probably not contains a system {system.model.fqn}:\n"
            f"{reasons}\n"
            f"\n"
            f"system id = {system.id}\n"
            f"model = {system.model.fqn}\n"
            f"replicon = {system.replicon_name}\n"
            f"hits = [{hits_repr}]\n"
            f"wholeness = {system.wholeness:.3f}\n"
        ]

        gene_categories = (("mandatory", system.mandatory_occ),
                           ("accessory", system.accessory_occ),
                           ("neutral", system.neutral_occ),
                           ("forbidden", system.forbidden_occ))
        for title, genes in gene_categories:
            parts.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                parts.append(f"\t- {g_name}: {len(gene_hits)} ")
                labels = [f"{h.gene.name}" for h in gene_hits]
                parts.append(f'({", ".join(labels)})\n')

        parts.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(parts)
class TsvSpecialHitSerializer:
    """
    Serialize special hits: :class:`macsypy.hit.Loner` and :class:`macsypy.hit.MultiSystem` in tsv format
    """

    def serialize(self, best_hits):
        """
        :param best_hits: the special hits to serialized
        :type best_hits: sequence of :class:`macsypy.hit.Loner` or :class:`macsypy.hit.MultiSystem` objects
        :return: a header line followed by one line per hit, for the *best_hits*
                 and their counterparts, each hit being reported once.
                 The lines are sorted by hit position, then replicon name, then hit id.
                 An empty string if there is no hit to serialize.
        :rtype: str
        """
        if not best_hits:
            return ""

        header = "replicon\tmodel_fqn\tfunction\tgene_name\t" \
                 "hit_id\thit_pos\thit_status\thit_seq_len\t" \
                 "hit_i_eval\thit_score\thit_profile_cov\t" \
                 "hit_seq_cov\thit_begin_match\thit_end_match\n"

        # a set, as a hit can be at the same time a best hit and the counterpart of an other one
        special_hits = set(best_hits)
        for best_hit in best_hits:
            special_hits.update(best_hit.counterpart)

        # The position alone does not order hits sharing the same position (for instance
        # on two replicons): they were emitted in the set iteration order, which is not
        # guaranteed to be the same from one run to an other.
        # The replicon name and the hit id are used to break the ties.
        ordered_hits = sorted(special_hits,
                              key=lambda h: (h.position, str(h.replicon_name), str(h.id)))

        rows = [header]
        for one_hit in ordered_hits:
            rows.append(
                f"{one_hit.replicon_name}\t{one_hit.gene_ref.model.fqn}\t{one_hit.gene_ref.alternate_of().name}\t"
                f"{one_hit.gene_ref.name}\t{one_hit.id}\t{one_hit.position:d}\t{one_hit.status}\t"
                f"{one_hit.seq_length:d}\t{one_hit.i_eval:.3e}\t{one_hit.score:.3f}\t"
                f"{one_hit.profile_coverage:.3f}\t{one_hit.sequence_coverage:.3f}\t"
                f"{one_hit.begin_match:d}\t{one_hit.end_match:d}\n"
            )
        return "".join(rows)
class TsvRejectedCandidatesSerializer:
    """
    Serialize the rejected candidates in tsv format
    """

    def serialize(self, candidates):
        """
        :param candidates: list of rejected candidates to serialize
        :type candidates: [ :class:`macsypy.system.RejectedCandidate` object, ...]
        :return: a header line, then for each candidate one line per hit of each of its clusters
                 followed by an empty line. The reasons of the candidate are joined with '/'.
                 An empty string if there is no candidate.
        :rtype: str
        """
        if not candidates:
            return ""

        # NOTE(review): the blank separator line is assumed to be written once per
        # candidate -- the extracted listing lost the indentation, confirm against
        # the repository copy.
        chunks = ["candidate_id\treplicon\tmodel_fqn\tcluster_id\thit_id\thit_pos\tgene_name\tfunction\treasons\n"]
        for candidate in candidates:
            why = '/'.join(candidate.reasons)
            for cluster in candidate.clusters:
                chunks.extend(
                    f"{candidate.id}\t{candidate.replicon_name}\t{candidate.model.fqn}\t"
                    f"{cluster.id}\t{hit.id}\t{hit.position}\t{hit.gene_ref.name}\t{hit.gene_ref.alternate_of().name}\t"
                    f"{why}\n"
                    for hit in cluster.hits
                )
            chunks.append('\n')
        return "".join(chunks)