Go to the documentation of this file. 1 #ifndef BIFROST_COMPACTED_DBG_HPP
2 #define BIFROST_COMPACTED_DBG_HPP
16 #include <unordered_map>
17 #include <unordered_set>
24 #include "BlockedBloomFilter.hpp"
26 #include "File_Parser.hpp"
27 #include "FASTX_Parser.hpp"
28 #include "GFA_Parser.hpp"
30 #include "KmerCovIndex.hpp"
31 #include "KmerHashTable.hpp"
32 #include "KmerIterator.hpp"
33 #include "KmerStream.hpp"
35 #include "minHashIterator.hpp"
36 #include "MinimizerIndex.hpp"
37 #include "RepHash.hpp"
38 #include "TinyVector.hpp"
45 #define MASK_CONTIG_ID (0xffffffff00000000)
46 #define MASK_CONTIG_TYPE (0x80000000)
47 #define MASK_CONTIG_POS (0x7fffffff)
48 #define RESERVED_ID (0xffffffff)
52 #define DEFAULT_G_DEC1 8
53 #define DEFAULT_G_DEC2 4
168 vector<string> filename_query_in;
170 CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(-1), nb_bits_unique_kmers_bf(14),
171 nb_bits_non_unique_kmers_bf(14), read_chunksize(64), ratio_kmers(0.8),
172 build(false), update(false), query(false), clipTips(false), deleteIsolated(false),
173 inexact_search(false), useMercyKmers(false), outputGFA(true), verbose(false) {}
212 template<
typename Unitig_data_t,
typename Graph_data_t =
void>
305 template<
typename Unitig_data_t =
void,
typename Graph_data_t =
void>
309 "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
311 typedef Unitig_data_t U;
312 typedef Graph_data_t G;
316 template<
typename U,
typename G,
bool is_const>
friend class UnitigMap;
317 template<
typename U,
typename G,
bool is_const>
friend class unitigIterator;
318 template<
typename U,
typename G,
bool is_const>
friend class neighborIterator;
320 template<
typename X,
typename Y>
friend class CompactedDBG;
411 bool simplify(
const bool delete_short_isolated_unitigs =
true,
const bool clip_short_tips =
true,
const bool verbose =
false);
420 bool write(
const string& output_filename,
const size_t nb_threads = 1,
const bool GFA_output =
true,
const bool verbose =
false)
const;
431 bool read(
const string& input_filename,
const size_t nb_threads = 1,
const bool verbose =
false);
483 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
484 const bool substitution,
const bool or_exclusive_match =
false);
498 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
499 const bool substitution,
const bool or_exclusive_match =
false)
const;
508 bool add(
const string& seq,
const bool verbose =
false);
541 bool merge(
const vector<CompactedDBG>& v,
const size_t nb_threads = 1,
const bool verbose =
false);
581 inline int getK()
const {
return k_; }
586 inline int getG()
const {
return g_; }
591 inline size_t size()
const {
return v_unitigs.size() + km_unitigs.size() + h_kmers_ccov.size(); }
601 inline const G*
getData()
const {
return data.getData(); }
603 bool search(
const vector<string>& query_filenames,
const string& out_filename_prefix,
604 const double ratio_kmers,
const bool inexact_search,
const size_t nb_threads,
605 const size_t verbose =
false)
const;
609 bool annotateSplitUnitigs(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
611 pair<size_t, size_t> splitAllUnitigs();
612 pair<size_t, size_t> getSplitInfoAllUnitigs()
const;
614 inline size_t joinUnitigs(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1) {
616 return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
619 bool mergeData(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
620 bool mergeData(
CompactedDBG<U, G>&& o,
const size_t nb_threads = 1,
const bool verbose =
false);
626 bool filter(
const CDBG_Build_opt& opt,
const size_t nb_unique_kmers,
const size_t nb_non_unique_kmers);
627 bool construct(
const CDBG_Build_opt& opt,
const size_t nb_unique_minimizers,
const size_t nb_non_unique_minimizers);
629 bool addUnitigSequenceBBF(
const Kmer km,
const string& seq,
const size_t pos_match_km,
const size_t len_match_km, LockGraph& lck_g);
631 size_t findUnitigSequenceBBF(
Kmer km,
string& s,
bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
632 bool bwStepBBF(
const Kmer km,
Kmer& front,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
633 bool fwStepBBF(
const Kmer km,
Kmer&
end,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
635 inline size_t find(
const preAllocMinHashIterator<RepHash>& it_min_h)
const {
637 const int pos = it_min_h.getPosition();
638 return (hmap_min_unitigs.find(Minimizer(it_min_h.s + pos).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
641 UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false);
642 const_UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false)
const;
646 vector<const_UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false)
const;
647 vector<const_UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false)
const;
649 vector<UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false);
650 vector<UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false);
658 bool addUnitig(
const string& str_unitig,
const size_t id_unitig);
659 bool addUnitig(
const string& str_unitig,
const size_t id_unitig,
const size_t id_unitig_r,
const size_t is_short_r);
660 bool addUnitig(
const string& str_unitig,
const size_t id_unitig, SpinLock& lck_unitig, SpinLock& lck_kmer);
661 void swapUnitigs(
const bool isShort,
const size_t id_a,
const size_t id_b);
663 bool mergeUnitig(
const string& seq,
const bool verbose =
false);
664 bool annotateSplitUnitig(
const string& seq,
const bool verbose =
false);
665 bool annotateSplitUnitig(
const string& seq, LockGraph& lck_g,
const bool verbose =
false);
667 template<
bool is_
void>
673 template<
bool is_
void>
676 template<
bool is_
void>
677 typename std::enable_if<!is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
678 const size_t id_unitig,
const bool delete_data =
true);
680 template<
bool is_
void>
681 typename std::enable_if<is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
682 const size_t id_unitig,
const bool delete_data =
true);
684 void deleteUnitig_(
const bool isShort,
const bool isAbundant,
const size_t id_unitig,
const string& str);
686 template<
bool is_
void>
687 typename std::enable_if<!is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
688 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
689 template<
bool is_
void>
690 typename std::enable_if<is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
691 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
693 pair<size_t, size_t> extractAllUnitigs();
695 template<
bool is_
void>
696 typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
698 template<
bool is_
void>
699 typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
701 void moveToAbundant();
702 void setFullCoverage(
const size_t cov)
const;
704 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins,
const size_t nb_threads)
const;
705 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<char>& joins,
const size_t nb_threads)
const;
708 void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
709 size_t removeUnitigs(
bool rmIsolated,
bool clipTips, vector<Kmer>& v);
711 size_t joinTips(
string filename_MBBF_uniq_kmers,
const size_t nb_threads = 1,
const bool verbose =
false);
712 vector<Kmer> extractMercyKmers(BlockedBloomFilter& bf_uniq_km,
const size_t nb_threads = 1,
const bool verbose =
false);
714 void writeGFA(
const string& graphfilename,
const size_t nb_threads = 1)
const;
715 void writeFASTA(
const string& graphfilename)
const;
717 void readGFA(
const string& graphfilename,
const size_t nb_threads = 1);
718 void readFASTA(
const string& graphfilename,
const size_t nb_threads = 1);
720 template<
bool is_
void>
721 typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
722 template<
bool is_
void>
723 typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
731 void setKmerGmerLength(
const int kmer_length,
const int minimizer_length = -1);
734 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
735 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match);
737 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
738 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match)
const;
745 static const int tiny_vector_sz = 2;
746 static const int min_abundance_lim = 15;
747 static const int max_abundance_lim = 15;
749 typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
751 vector<Unitig<U>*> v_unitigs;
753 KmerCovIndex<U> km_unitigs;
754 MinimizerIndex hmap_min_unitigs;
756 h_kmers_ccov_t h_kmers_ccov;
758 BlockedBloomFilter bf;
763 #include "CompactedDBG.tcc"
764 #include "Search.tcc"
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:591
const_UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false) const
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
CompactedDBG< U, G > & operator=(CompactedDBG< U, G > &&o)
Move assignment operator (move a compacted de Bruijn graph).
bool simplify(const bool delete_short_isolated_unitigs=true, const bool clip_short_tips=true, const bool verbose=false)
Simplify the Compacted de Bruijn graph: clip short (< 2k length) tips and/or delete short (< 2k lengt...
bool useMercyKmers
Keep in the graph low coverage k-mers (cov=1) connecting tips of the graph.
Definition: CompactedDBG.hpp:157
bool write(const string &output_filename, const size_t nb_threads=1, const bool GFA_output=true, const bool verbose=false) const
Write the Compacted de Bruijn graph to disk (GFA1 format).
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:265
CompactedDBG< U, G > & operator+=(const CompactedDBG< U, G > &o)
Addition assignment operator (merge a compacted de Bruijn graph).
CompactedDBG< U, G > & operator=(const CompactedDBG< U, G > &o)
Copy assignment operator (copy a compacted de Bruijn graph).
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:139
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:275
bool add(const string &seq, const bool verbose=false)
Add a sequence to the Compacted de Bruijn graph.
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files,...
Definition: CompactedDBG.hpp:576
UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len)
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:601
bool operator==(const CompactedDBG< U, G > &o) const
Equality operator.
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:40
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:323
vector< pair< size_t, UnitigMap< U, G > > > searchSequence(const string &seq, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false)
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
bool operator!=(const CompactedDBG< U, G > &o) const
Inequality operator.
bool remove(const const_UnitigMap< U, G > &um, const bool verbose=false)
Remove a unitig from the Compacted de Bruijn graph.
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:164
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:581
bool read(const string &input_filename, const size_t nb_threads=1, const bool verbose=false)
Read a Compacted de Bruijn graph from disk (GFA1 or FASTA format).
int getG() const
Return the length of minimizers of the graph.
Definition: CompactedDBG.hpp:586
size_t nb_bits_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least once in the FASTA/FASTQ/GFA files of CDBG_Bu...
Definition: CompactedDBG.hpp:135
CompactedDBG(const CompactedDBG< U, G > &o)
Copy constructor (copy a compacted de Bruijn graph).
CompactedDBG(CompactedDBG< U, G > &&o)
Move constructor (move a compacted de Bruijn graph).
UnitigMap type interface.
bool build(CDBG_Build_opt &opt)
Build the Compacted de Bruijn graph.
CompactedDBG(const int kmer_length, const int minimizer_length)
Constructor (set up an empty compacted dBG).
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:151
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:141
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:213
Contain all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: UnitigMap.hpp:92
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:130
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:166
iterator begin()
Create an iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sorted lexic...
const_iterator end() const
Create a constant iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs ar...
UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false)
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:155
const_UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len) const
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
size_t nb_bits_non_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least twice in the FASTA/FASTQ/GFA files of CDBG_B...
Definition: CompactedDBG.hpp:136
const_iterator begin() const
Create an constant iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sor...
vector< pair< size_t, const_UnitigMap< U, G > > > searchSequence(const string &seq, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false) const
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
CompactedDBG(const int kmer_length=31)
Constructor (set up an empty compacted dBG).
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:152
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:222
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:142
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:132
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:237
bool merge(const vector< CompactedDBG > &v, const size_t nb_threads=1, const bool verbose=false)
Merge multiple compacted de Bruijn graphs.
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for:
Definition: CompactedDBG.hpp:128
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:596
void clear()
Clear the graph: empty the graph and reset its parameters.
iterator end()
Create an iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs are NOT so...
Interface for the class Kmer:
size_t read_chunksize
Number of reads a thread can read and process at a time.
Definition: CompactedDBG.hpp:133
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
The unitigIterator type interface.
virtual ~CompactedDBG()
Destructor.
bool deleteIsolated
Remove short isolated unitigs (length < 2k) of the graph (not used by CompactedDBG<U,...
Definition: CompactedDBG.hpp:156
size_t length() const
Return the sum of the unitigs length.
Definition: ColorSet.hpp:16
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B to the data of a sub-unitig A.
Definition: CompactedDBG.hpp:249
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:322
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:306
bool outputGFA
Boolean indicating if the graph is written to a GFA file (true) or if the unitigs are written to a FA...
Definition: CompactedDBG.hpp:159
bool merge(const CompactedDBG &o, const size_t nb_threads=1, const bool verbose=false)
Merge a compacted de Bruijn graph.
size_t nbKmers() const
Return the number of k-mers in the graph.
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:138
int k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:149