Go to the documentation of this file. 1 #ifndef BIFROST_COMPACTED_DBG_HPP
2 #define BIFROST_COMPACTED_DBG_HPP
18 #include <unordered_map>
19 #include <unordered_set>
26 #include "BlockedBloomFilter.hpp"
28 #include "File_Parser.hpp"
29 #include "FASTX_Parser.hpp"
30 #include "GFA_Parser.hpp"
32 #include "KmerCovIndex.hpp"
33 #include "KmerHashTable.hpp"
34 #include "KmerIterator.hpp"
35 #include "KmerStream.hpp"
37 #include "minHashIterator.hpp"
38 #include "MinimizerIndex.hpp"
39 #include "RepHash.hpp"
40 #include "TinyVector.hpp"
47 #define MASK_CONTIG_ID (0xffffffff00000000)
48 #define MASK_CONTIG_TYPE (0x80000000)
49 #define MASK_CONTIG_POS (0x7fffffff)
50 #define RESERVED_ID (0xffffffff)
54 #define DEFAULT_G_DEC1 8
55 #define DEFAULT_G_DEC2 4
173 string filename_index_in;
175 vector<string> filename_query_in;
177 CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(-1), nb_bits_kmers_bf(14), ratio_kmers(0.8), min_count_km(1),
178 build(false), update(false), query(false), clipTips(false), deleteIsolated(false),
179 inexact_search(false), writeIndexFile(true), useMercyKmers(false), outputGFA(true),
180 outputFASTA(false), outputBFG(false), compressOutput(true), verbose(false) {}
219 template<
typename Unitig_data_t,
typename Graph_data_t =
void>
312 template<
typename Unitig_data_t =
void,
typename Graph_data_t =
void>
316 "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
318 typedef Unitig_data_t U;
319 typedef Graph_data_t G;
323 template<
typename U,
typename G,
bool is_const>
friend class UnitigMap;
324 template<
typename U,
typename G,
bool is_const>
friend class unitigIterator;
325 template<
typename U,
typename G,
bool is_const>
friend class neighborIterator;
327 template<
typename X,
typename Y>
friend class CompactedDBG;
336 CompactedDBG(
const int kmer_length = DEFAULT_K,
const int minimizer_length = -1);
413 bool simplify(
const bool delete_short_isolated_unitigs =
true,
const bool clip_short_tips =
true,
const bool verbose =
false);
427 bool write(
const string& output_fn,
const size_t nb_threads = 1,
const bool GFA_output =
true,
const bool FASTA_output =
false,
428 const bool BFG_output =
false,
const bool write_index_file =
true,
const bool compressed_output =
false,
429 const bool verbose =
false)
const;
442 bool read(
const string& input_graph_fn,
const size_t nb_threads = 1,
const bool verbose =
false);
455 bool read(
const string& input_graph_fn,
const string& input_index_fn,
const size_t nb_threads = 1,
const bool verbose =
false);
507 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& s,
const bool exact,
const bool insertion,
const bool deletion,
508 const bool substitution,
const bool or_exclusive_match =
false);
522 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& s,
const bool exact,
const bool insertion,
const bool deletion,
523 const bool substitution,
const bool or_exclusive_match =
false)
const;
532 bool add(
const string& seq,
const bool verbose =
false);
565 bool merge(
const vector<CompactedDBG>& v,
const size_t nb_threads = 1,
const bool verbose =
false);
605 inline int getK()
const {
return k_; }
610 inline int getG()
const {
return g_; }
615 inline size_t size()
const {
return v_unitigs.size() + km_unitigs.size() + h_kmers_ccov.size(); }
625 inline const G*
getData()
const {
return data.getData(); }
627 bool search(
const vector<string>& query_filenames,
const string& out_filename_prefix,
628 const double ratio_kmers,
const bool inexact_search,
const size_t nb_threads,
629 const size_t verbose =
false)
const;
631 bool writeBinary(
const string& fn,
const size_t nb_threads = 1)
const;
632 bool writeBinary(ostream& out,
const size_t nb_threads = 1)
const;
634 bool readBinary(
const string& fn);
635 bool readBinary(istream& in);
639 bool annotateSplitUnitigs(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
641 pair<size_t, size_t> splitAllUnitigs();
642 pair<size_t, size_t> getSplitInfoAllUnitigs()
const;
644 inline size_t joinUnitigs(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1) {
646 return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
649 bool mergeData(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
650 bool mergeData(
CompactedDBG<U, G>&& o,
const size_t nb_threads = 1,
const bool verbose =
false);
654 bool writeBinaryGraph(ostream& out,
const size_t nb_threads = 1)
const;
655 bool writeBinaryGraph(
const string& fn,
const size_t nb_threads = 1)
const;
657 bool writeBinaryIndex(ostream& out,
const uint64_t checksum,
const size_t nb_threads = 1)
const;
658 bool writeBinaryIndex(
const string& fn,
const uint64_t checksum,
const size_t nb_threads = 1)
const;
660 pair<uint64_t, bool> readBinaryGraph(istream& in);
661 pair<uint64_t, bool> readBinaryGraph(
const string& fn);
663 bool readBinaryIndex(istream& in,
const uint64_t checksum);
664 bool readBinaryIndex(
const string& fn,
const uint64_t checksum);
666 bool readBinaryIndexHead(
const string& fn,
size_t& file_format_version,
size_t& v_unitigs_sz,
size_t& km_unitigs_sz,
667 size_t& h_kmers_ccov_sz,
size_t& hmap_min_unitigs_sz, uint64_t& read_checksum)
const;
669 bool readBinaryIndexHead(istream& in,
size_t& file_format_version,
size_t& v_unitigs_sz,
size_t& km_unitigs_sz,
670 size_t& h_kmers_ccov_sz,
size_t& hmap_min_unitigs_sz, uint64_t& read_checksum)
const;
672 uint64_t checksum()
const;
676 pair<bool, pair<BlockedBloomFilter, Roaring>> filter(
const CDBG_Build_opt& opt,
const size_t nb_unique_kmers,
const size_t nb_non_unique_kmers);
677 bool construct(
const CDBG_Build_opt& opt, BlockedBloomFilter& bf, Roaring& r,
const size_t nb_unique_minimizers,
const size_t nb_non_unique_minimizers);
679 void addUnitigSequence(
const Kmer km,
const string& seq,
const size_t pos_match_km,
const size_t len_match_km, LockGraph& lck_g,
const bool map_read =
true);
682 size_t findUnitigSequenceBBF(
const BlockedBloomFilter& bf,
const Kmer km,
string& s,
bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
683 size_t findUnitigSequenceBBF(
const BlockedBloomFilter& bf,
const Kmer km,
string& s,
bool& isIsolated, vector<Kmer>& l_ignored_km_tip, LockGraph& lck_g);
685 bool bwStepBBF(
const BlockedBloomFilter& bf,
const Kmer km,
Kmer& front,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
686 bool fwStepBBF(
const BlockedBloomFilter& bf,
const Kmer km,
Kmer&
end,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
688 inline size_t find(
const preAllocMinHashIterator<RepHash>& it_min_h)
const {
690 const int pos = it_min_h.getPosition();
691 return (hmap_min_unitigs.find(Minimizer(it_min_h.s + pos).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
694 UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false);
695 const_UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false)
const;
701 vector<const_UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false)
const;
702 vector<const_UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false)
const;
704 vector<UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false);
705 vector<UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false);
713 bool addUnitig(
const string& str_unitig,
const size_t id_unitig);
714 bool addUnitig(
const string& str_unitig,
const size_t id_unitig,
const size_t id_unitig_r,
const size_t is_short_r);
715 bool addUnitig(
const string& str_unitig,
const size_t id_unitig, SpinLock& lck_unitig, SpinLock& lck_kmer);
716 void swapUnitigs(
const bool isShort,
const size_t id_a,
const size_t id_b);
718 bool mergeUnitig(
const string& seq,
const bool verbose =
false);
719 bool annotateSplitUnitig(
const string& seq,
const bool verbose =
false);
720 bool annotateSplitUnitig(
const string& seq, LockGraph& lck_g,
const bool verbose =
false);
722 template<
bool is_
void>
728 template<
bool is_
void>
731 template<
bool is_
void>
732 typename std::enable_if<!is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
733 const size_t id_unitig,
const bool delete_data =
true);
735 template<
bool is_
void>
736 typename std::enable_if<is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
737 const size_t id_unitig,
const bool delete_data =
true);
739 void deleteUnitig_(
const bool isShort,
const bool isAbundant,
const size_t id_unitig,
const string& str);
741 template<
bool is_
void>
742 typename std::enable_if<!is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
743 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
744 template<
bool is_
void>
745 typename std::enable_if<is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
746 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
748 pair<size_t, size_t> extractAllUnitigs();
750 template<
bool is_
void>
751 typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
753 template<
bool is_
void>
754 typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
756 void moveToAbundant();
757 void setFullCoverage(
const size_t cov)
const;
759 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins,
const size_t nb_threads)
const;
760 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<char>& joins,
const size_t nb_threads)
const;
763 void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
764 size_t removeUnitigs(
bool rmIsolated,
bool clipTips, vector<Kmer>& v);
766 size_t joinTips(
string filename_MBBF_uniq_kmers,
const size_t nb_threads = 1,
const bool verbose =
false);
767 vector<Kmer> extractMercyKmers(
const BlockedBloomFilter& bf_uniq_km,
const size_t nb_threads = 1,
const bool verbose =
false);
769 bool writeGFA(
const string& fn,
const size_t nb_threads = 1,
const bool compressed_output =
false)
const;
770 bool writeFASTA(
const string& fn,
const bool compressed_output =
false)
const;
772 void makeGraphFromGFA(
const string& fn,
const size_t nb_threads = 1);
773 void makeGraphFromFASTA(
const string& fn,
const size_t nb_threads = 1);
775 pair<uint64_t, bool> readGraphFromIndexGFA(
const string& graph_fn,
const string& index_fn,
const size_t k,
const size_t g);
776 pair<uint64_t, bool> readGraphFromIndexFASTA(
const string& graph_fn,
const string& index_fn,
const size_t k,
const size_t g);
778 template<
bool is_
void>
779 typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
780 template<
bool is_
void>
781 typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
789 void setKmerGmerLength(
const int kmer_length,
const int minimizer_length = -1);
792 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
793 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match);
795 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
796 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match)
const;
803 static const int tiny_vector_sz = 2;
804 static const int min_abundance_lim = 15;
805 static const int max_abundance_lim = 15;
807 typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
809 vector<Unitig<U>*> v_unitigs;
810 KmerCovIndex<U> km_unitigs;
811 h_kmers_ccov_t h_kmers_ccov;
813 MinimizerIndex hmap_min_unitigs;
818 #include "CompactedDBG.tcc"
819 #include "Search.tcc"
bool outputBFG
Boolean indicating if the graph is written to a BFG/BFI file.
Definition: CompactedDBG.hpp:161
size_t nb_bits_kmers_bf
Number of Bloom filter bits per k-mer occurring in the FASTA/FASTQ/GFA files of CDBG_Build_opt::filen...
Definition: CompactedDBG.hpp:136
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:615
const_UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false) const
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
CompactedDBG< U, G > & operator=(CompactedDBG< U, G > &&o)
Move assignment operator (move a compacted de Bruijn graph).
bool simplify(const bool delete_short_isolated_unitigs=true, const bool clip_short_tips=true, const bool verbose=false)
Simplify the Compacted de Bruijn graph: clip short (< 2k length) tips and/or delete short (< 2k lengt...
bool outputFASTA
Boolean indicating if the graph is written to a FASTA file.
Definition: CompactedDBG.hpp:160
bool useMercyKmers
Keep in the graph low coverage k-mers (cov=1) connecting tips of the graph.
Definition: CompactedDBG.hpp:157
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:272
CompactedDBG< U, G > & operator+=(const CompactedDBG< U, G > &o)
Addition assignment operator (merge a compacted de Bruijn graph).
CompactedDBG< U, G > & operator=(const CompactedDBG< U, G > &o)
Copy assignment operator (copy a compacted de Bruijn graph).
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:139
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:282
bool read(const string &input_graph_fn, const string &input_index_fn, const size_t nb_threads=1, const bool verbose=false)
Read a Compacted de Bruijn graph from disk (GFA1, FASTA or BFG format) using an index file (BFI forma...
bool add(const string &seq, const bool verbose=false)
Add a sequence to the Compacted de Bruijn graph.
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files,...
Definition: CompactedDBG.hpp:600
UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len)
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:625
bool operator==(const CompactedDBG< U, G > &o) const
Equality operator.
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:42
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:330
bool operator!=(const CompactedDBG< U, G > &o) const
Inequality operator.
bool remove(const const_UnitigMap< U, G > &um, const bool verbose=false)
Remove a unitig from the Compacted de Bruijn graph.
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:170
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:605
int getG() const
Return the length of minimizers of the graph.
Definition: CompactedDBG.hpp:610
CompactedDBG(const CompactedDBG< U, G > &o)
Copy constructor (copy a compacted de Bruijn graph).
CompactedDBG(CompactedDBG< U, G > &&o)
Move constructor (move a compacted de Bruijn graph).
CompactedDBG(const int kmer_length=31, const int minimizer_length=-1)
Constructor (set up an empty compacted dBG).
UnitigMap type interface.
bool build(CDBG_Build_opt &opt)
Build the Compacted de Bruijn graph.
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:151
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:141
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:220
Contain all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: UnitigMap.hpp:92
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:130
bool read(const string &input_graph_fn, const size_t nb_threads=1, const bool verbose=false)
Load a Compacted de Bruijn graph from disk (GFA1 or FASTA format).
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:172
iterator begin()
Create an iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sorted lexic...
const_iterator end() const
Create a constant iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs ar...
UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false)
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:155
const_UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len) const
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
const_iterator begin() const
Create an constant iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sor...
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:152
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:229
vector< pair< size_t, const_UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false) const
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:142
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:132
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:244
bool merge(const vector< CompactedDBG > &v, const size_t nb_threads=1, const bool verbose=false)
Merge multiple compacted de Bruijn graphs.
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for:
Definition: CompactedDBG.hpp:128
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:620
void clear()
Clear the graph: empty the graph and reset its parameters.
iterator end()
Create an iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs are NOT so...
Interface for the class Kmer:
vector< pair< size_t, UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false)
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
The unitigIterator type interface.
virtual ~CompactedDBG()
Destructor.
bool deleteIsolated
Remove short isolated unitigs (length < 2k) of the graph (not used by CompactedDBG<U,...
Definition: CompactedDBG.hpp:156
size_t length() const
Return the sum of the unitigs length.
Definition: ColorSet.hpp:16
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B to the data of a sub-unitig A.
Definition: CompactedDBG.hpp:256
bool write(const string &output_fn, const size_t nb_threads=1, const bool GFA_output=true, const bool FASTA_output=false, const bool BFG_output=false, const bool write_index_file=true, const bool compressed_output=false, const bool verbose=false) const
Write the Compacted de Bruijn graph to disk (GFA1 format).
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:329
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:313
bool outputGFA
Boolean indicating if the graph is written to a GFA file.
Definition: CompactedDBG.hpp:159
bool merge(const CompactedDBG &o, const size_t nb_threads=1, const bool verbose=false)
Merge a compacted de Bruijn graph.
size_t nbKmers() const
Return the number of k-mers in the graph.
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:138
int k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:149