Go to the documentation of this file. 1 #ifndef BIFROST_COMPACTED_DBG_HPP
2 #define BIFROST_COMPACTED_DBG_HPP
18 #include <unordered_map>
19 #include <unordered_set>
26 #include "BlockedBloomFilter.hpp"
28 #include "File_Parser.hpp"
29 #include "FASTX_Parser.hpp"
30 #include "GFA_Parser.hpp"
32 #include "KmerCovIndex.hpp"
33 #include "KmerHashTable.hpp"
34 #include "KmerIterator.hpp"
35 #include "KmerStream.hpp"
37 #include "minHashIterator.hpp"
38 #include "MinimizerIndex.hpp"
39 #include "RepHash.hpp"
40 #include "TinyVector.hpp"
47 #define MASK_CONTIG_ID (0xffffffff00000000)
48 #define MASK_CONTIG_TYPE (0x80000000)
49 #define MASK_CONTIG_POS (0x7fffffff)
50 #define RESERVED_ID (0xffffffff)
54 #define DEFAULT_G_DEC1 8
55 #define DEFAULT_G_DEC2 4
167 vector<string> filename_query_in;
169 CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(-1), nb_bits_unique_kmers_bf(14),
170 nb_bits_non_unique_kmers_bf(14), ratio_kmers(0.8),
171 build(false), update(false), query(false), clipTips(false), deleteIsolated(false),
172 inexact_search(false), useMercyKmers(false), outputGFA(true), verbose(false) {}
211 template<
typename Unitig_data_t,
typename Graph_data_t =
void>
304 template<
typename Unitig_data_t =
void,
typename Graph_data_t =
void>
308 "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
310 typedef Unitig_data_t U;
311 typedef Graph_data_t G;
315 template<
typename U,
typename G,
bool is_const>
friend class UnitigMap;
316 template<
typename U,
typename G,
bool is_const>
friend class unitigIterator;
317 template<
typename U,
typename G,
bool is_const>
friend class neighborIterator;
319 template<
typename X,
typename Y>
friend class CompactedDBG;
328 CompactedDBG(
const int kmer_length = DEFAULT_K,
const int minimizer_length = -1);
405 bool simplify(
const bool delete_short_isolated_unitigs =
true,
const bool clip_short_tips =
true,
const bool verbose =
false);
414 bool write(
const string& output_filename,
const size_t nb_threads = 1,
const bool GFA_output =
true,
const bool verbose =
false)
const;
425 bool read(
const string& input_filename,
const size_t nb_threads = 1,
const bool verbose =
false);
477 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& s,
const bool exact,
const bool insertion,
const bool deletion,
478 const bool substitution,
const bool or_exclusive_match =
false);
492 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& s,
const bool exact,
const bool insertion,
const bool deletion,
493 const bool substitution,
const bool or_exclusive_match =
false)
const;
502 bool add(
const string& seq,
const bool verbose =
false);
535 bool merge(
const vector<CompactedDBG>& v,
const size_t nb_threads = 1,
const bool verbose =
false);
575 inline int getK()
const {
return k_; }
580 inline int getG()
const {
return g_; }
585 inline size_t size()
const {
return v_unitigs.size() + km_unitigs.size() + h_kmers_ccov.size(); }
595 inline const G*
getData()
const {
return data.getData(); }
597 bool search(
const vector<string>& query_filenames,
const string& out_filename_prefix,
598 const double ratio_kmers,
const bool inexact_search,
const size_t nb_threads,
599 const size_t verbose =
false)
const;
603 bool annotateSplitUnitigs(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
605 pair<size_t, size_t> splitAllUnitigs();
606 pair<size_t, size_t> getSplitInfoAllUnitigs()
const;
608 inline size_t joinUnitigs(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1) {
610 return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
613 bool mergeData(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
614 bool mergeData(
CompactedDBG<U, G>&& o,
const size_t nb_threads = 1,
const bool verbose =
false);
620 bool filter(
const CDBG_Build_opt& opt,
const size_t nb_unique_kmers,
const size_t nb_non_unique_kmers);
621 bool construct(
const CDBG_Build_opt& opt,
const size_t nb_unique_minimizers,
const size_t nb_non_unique_minimizers);
623 bool addUnitigSequenceBBF(
const Kmer km,
const string& seq,
const size_t pos_match_km,
const size_t len_match_km, LockGraph& lck_g);
625 size_t findUnitigSequenceBBF(
Kmer km,
string& s,
bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
626 bool bwStepBBF(
const Kmer km,
Kmer& front,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
627 bool fwStepBBF(
const Kmer km,
Kmer&
end,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
629 inline size_t find(
const preAllocMinHashIterator<RepHash>& it_min_h)
const {
631 const int pos = it_min_h.getPosition();
632 return (hmap_min_unitigs.find(Minimizer(it_min_h.s + pos).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
635 UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false);
636 const_UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false)
const;
642 vector<const_UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false)
const;
643 vector<const_UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false)
const;
645 vector<UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false);
646 vector<UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false);
654 bool addUnitig(
const string& str_unitig,
const size_t id_unitig);
655 bool addUnitig(
const string& str_unitig,
const size_t id_unitig,
const size_t id_unitig_r,
const size_t is_short_r);
656 bool addUnitig(
const string& str_unitig,
const size_t id_unitig, SpinLock& lck_unitig, SpinLock& lck_kmer);
657 void swapUnitigs(
const bool isShort,
const size_t id_a,
const size_t id_b);
659 bool mergeUnitig(
const string& seq,
const bool verbose =
false);
660 bool annotateSplitUnitig(
const string& seq,
const bool verbose =
false);
661 bool annotateSplitUnitig(
const string& seq, LockGraph& lck_g,
const bool verbose =
false);
663 template<
bool is_
void>
669 template<
bool is_
void>
672 template<
bool is_
void>
673 typename std::enable_if<!is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
674 const size_t id_unitig,
const bool delete_data =
true);
676 template<
bool is_
void>
677 typename std::enable_if<is_void, void>::type deleteUnitig_(
const bool isShort,
const bool isAbundant,
678 const size_t id_unitig,
const bool delete_data =
true);
680 void deleteUnitig_(
const bool isShort,
const bool isAbundant,
const size_t id_unitig,
const string& str);
682 template<
bool is_
void>
683 typename std::enable_if<!is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
684 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
685 template<
bool is_
void>
686 typename std::enable_if<is_void, bool>::type extractUnitig_(
size_t& pos_v_unitigs,
size_t& nxt_pos_insert_v_unitigs,
687 size_t& v_unitigs_sz,
size_t& v_kmers_sz,
const vector<pair<int,int>>& sp);
689 pair<size_t, size_t> extractAllUnitigs();
691 template<
bool is_
void>
692 typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
694 template<
bool is_
void>
695 typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
697 void moveToAbundant();
698 void setFullCoverage(
const size_t cov)
const;
700 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins,
const size_t nb_threads)
const;
701 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<char>& joins,
const size_t nb_threads)
const;
704 void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
705 size_t removeUnitigs(
bool rmIsolated,
bool clipTips, vector<Kmer>& v);
707 size_t joinTips(
string filename_MBBF_uniq_kmers,
const size_t nb_threads = 1,
const bool verbose =
false);
708 vector<Kmer> extractMercyKmers(BlockedBloomFilter& bf_uniq_km,
const size_t nb_threads = 1,
const bool verbose =
false);
710 void writeGFA(
const string& graphfilename,
const size_t nb_threads = 1)
const;
711 void writeFASTA(
const string& graphfilename)
const;
713 void readGFA(
const string& graphfilename,
const size_t nb_threads = 1);
714 void readFASTA(
const string& graphfilename,
const size_t nb_threads = 1);
716 template<
bool is_
void>
717 typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
718 template<
bool is_
void>
719 typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
727 void setKmerGmerLength(
const int kmer_length,
const int minimizer_length = -1);
730 vector<pair<size_t, UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
731 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match);
733 vector<pair<size_t, const_UnitigMap<U, G>>>
searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
734 const bool substitution,
const double ratio_kmers,
const bool or_exclusive_match)
const;
741 static const int tiny_vector_sz = 2;
742 static const int min_abundance_lim = 15;
743 static const int max_abundance_lim = 15;
745 typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
747 vector<Unitig<U>*> v_unitigs;
749 KmerCovIndex<U> km_unitigs;
750 MinimizerIndex hmap_min_unitigs;
752 h_kmers_ccov_t h_kmers_ccov;
754 BlockedBloomFilter bf;
759 #include "CompactedDBG.tcc"
760 #include "Search.tcc"
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:585
const_UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false) const
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
CompactedDBG< U, G > & operator=(CompactedDBG< U, G > &&o)
Move assignment operator (move a compacted de Bruijn graph).
bool simplify(const bool delete_short_isolated_unitigs=true, const bool clip_short_tips=true, const bool verbose=false)
Simplify the Compacted de Bruijn graph: clip short (< 2k length) tips and/or delete short (< 2k lengt...
bool useMercyKmers
Keep in the graph low coverage k-mers (cov=1) connecting tips of the graph.
Definition: CompactedDBG.hpp:156
bool write(const string &output_filename, const size_t nb_threads=1, const bool GFA_output=true, const bool verbose=false) const
Write the Compacted de Bruijn graph to disk (GFA1 format).
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:264
CompactedDBG< U, G > & operator+=(const CompactedDBG< U, G > &o)
Addition assignment operator (merge a compacted de Bruijn graph).
CompactedDBG< U, G > & operator=(const CompactedDBG< U, G > &o)
Copy assignment operator (copy a compacted de Bruijn graph).
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:138
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:274
bool add(const string &seq, const bool verbose=false)
Add a sequence to the Compacted de Bruijn graph.
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files,...
Definition: CompactedDBG.hpp:570
UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len)
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:595
bool operator==(const CompactedDBG< U, G > &o) const
Equality operator.
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:42
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:322
bool operator!=(const CompactedDBG< U, G > &o) const
Inequality operator.
bool remove(const const_UnitigMap< U, G > &um, const bool verbose=false)
Remove a unitig from the Compacted de Bruijn graph.
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:163
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:575
bool read(const string &input_filename, const size_t nb_threads=1, const bool verbose=false)
Read a Compacted de Bruijn graph from disk (GFA1 or FASTA format).
int getG() const
Return the length of minimizers of the graph.
Definition: CompactedDBG.hpp:580
size_t nb_bits_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least once in the FASTA/FASTQ/GFA files of CDBG_Bu...
Definition: CompactedDBG.hpp:134
CompactedDBG(const CompactedDBG< U, G > &o)
Copy constructor (copy a compacted de Bruijn graph).
CompactedDBG(CompactedDBG< U, G > &&o)
Move constructor (move a compacted de Bruijn graph).
CompactedDBG(const int kmer_length=31, const int minimizer_length=-1)
Constructor (set up an empty compacted dBG).
UnitigMap type interface.
bool build(CDBG_Build_opt &opt)
Build the Compacted de Bruijn graph.
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:150
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:140
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:212
Contain all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: UnitigMap.hpp:92
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:130
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:165
iterator begin()
Create an iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sorted lexic...
const_iterator end() const
Create a constant iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs ar...
UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false)
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:154
const_UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len) const
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
size_t nb_bits_non_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least twice in the FASTA/FASTQ/GFA files of CDBG_B...
Definition: CompactedDBG.hpp:135
const_iterator begin() const
Create an constant iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sor...
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:151
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:221
vector< pair< size_t, const_UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false) const
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:141
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:132
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:236
bool merge(const vector< CompactedDBG > &v, const size_t nb_threads=1, const bool verbose=false)
Merge multiple compacted de Bruijn graphs.
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for:
Definition: CompactedDBG.hpp:128
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:590
void clear()
Clear the graph: empty the graph and reset its parameters.
iterator end()
Create an iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs are NOT so...
Interface for the class Kmer:
vector< pair< size_t, UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false)
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
The unitigIterator type interface.
virtual ~CompactedDBG()
Destructor.
bool deleteIsolated
Remove short isolated unitigs (length < 2k) of the graph (not used by CompactedDBG<U,...
Definition: CompactedDBG.hpp:155
size_t length() const
Return the sum of the unitigs length.
Definition: ColorSet.hpp:16
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B to the data of a sub-unitig A.
Definition: CompactedDBG.hpp:248
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:321
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:305
bool outputGFA
Boolean indicating if the graph is written to a GFA file (true) or if the unitigs are written to a FA...
Definition: CompactedDBG.hpp:158
bool merge(const CompactedDBG &o, const size_t nb_threads=1, const bool verbose=false)
Merge a compacted de Bruijn graph.
size_t nbKmers() const
Return the number of k-mers in the graph.
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:137
int k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:148