Bifrost
CompactedDBG.hpp
Go to the documentation of this file.
1 #ifndef BIFROST_COMPACTED_DBG_HPP
2 #define BIFROST_COMPACTED_DBG_HPP
3 
4 #include <cmath>
5 #include <cstdlib>
6 #include <cstring>
7 #include <ctime>
8 #include <cstdio>
9 #include <climits>
10 #include <functional>
11 #include <getopt.h>
12 #include <iostream>
13 #include <map>
14 #include <set>
15 #include <sstream>
16 #include <stdint.h>
17 #include <string>
18 #include <unordered_map>
19 #include <unordered_set>
20 #include <vector>
21 
22 #include <thread>
23 #include <atomic>
24 #include <mutex>
25 
26 #include "BlockedBloomFilter.hpp"
27 #include "Common.hpp"
28 #include "File_Parser.hpp"
29 #include "FASTX_Parser.hpp"
30 #include "GFA_Parser.hpp"
31 #include "Kmer.hpp"
32 #include "KmerCovIndex.hpp"
33 #include "KmerHashTable.hpp"
34 #include "KmerIterator.hpp"
35 #include "KmerStream.hpp"
36 #include "Lock.hpp"
37 #include "minHashIterator.hpp"
38 #include "MinimizerIndex.hpp"
39 #include "RepHash.hpp"
40 #include "TinyVector.hpp"
41 #include "Unitig.hpp"
42 #include "UnitigIterator.hpp"
43 #include "UnitigMap.hpp"
44 
45 #include "roaring.hh"
46 
47 #define MASK_CONTIG_ID (0xffffffff00000000)
48 #define MASK_CONTIG_TYPE (0x80000000)
49 #define MASK_CONTIG_POS (0x7fffffff)
50 #define RESERVED_ID (0xffffffff)
51 
52 #define DEFAULT_K 31
53 
54 #define DEFAULT_G_DEC1 8
55 #define DEFAULT_G_DEC2 4
56 
62 using namespace std;
63 
129 
130  bool verbose;
131 
132  size_t nb_threads;
133 
134  size_t min_count_km;
135 
137 
140 
141  vector<string> filename_seq_in;
142  vector<string> filename_ref_in;
143 
144  // The following members are NOT used by CompactedDBG<U, G>::build
145  // but you can set them to use them as parameters for other functions
146  // such as CompactedDBG<U, G>::simplify, CompactedDBG<U, G>::read or
147  // CompactedDBG<U, G>::write.
148 
149  int k, g;
150 
151  bool build;
152  bool update;
153  bool query;
154 
155  bool clipTips;
158 
159  bool outputGFA;
161  bool outputBFG;
162 
163  bool compressOutput;
164  bool inexact_search;
165 
166  bool writeIndexFile;
167 
168  double ratio_kmers;
169 
171 
173  string filename_index_in;
174 
175  vector<string> filename_query_in;
176 
177  CDBG_Build_opt() : nb_threads(1), k(DEFAULT_K), g(-1), nb_bits_kmers_bf(14), ratio_kmers(0.8), min_count_km(1),
178  build(false), update(false), query(false), clipTips(false), deleteIsolated(false),
179  inexact_search(false), writeIndexFile(true), useMercyKmers(false), outputGFA(true),
180  outputFASTA(false), outputBFG(false), compressOutput(true), verbose(false) {}
181 };
182 
188 template<typename U = void, typename G = void> using const_UnitigMap = UnitigMap<U, G, true>;
189 
219 template<typename Unitig_data_t, typename Graph_data_t = void> //Curiously Recurring Template Pattern (CRTP)
220 class CDBG_Data_t {
221 
222  public:
223 
230 
245 
257 
272  void extract(const UnitigMap<Unitig_data_t, Graph_data_t>& um_src, bool last_extraction){}
273 
283 
284  return string();
285  }
286 };
287 
312 template<typename Unitig_data_t = void, typename Graph_data_t = void>
314 
315  static_assert(is_void<Unitig_data_t>::value || is_base_of<CDBG_Data_t<Unitig_data_t, Graph_data_t>, Unitig_data_t>::value,
316  "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
317 
318  typedef Unitig_data_t U;
319  typedef Graph_data_t G;
320 
321  public:
322 
323  template<typename U, typename G, bool is_const> friend class UnitigMap;
324  template<typename U, typename G, bool is_const> friend class unitigIterator;
325  template<typename U, typename G, bool is_const> friend class neighborIterator;
326 
327  template<typename X, typename Y> friend class CompactedDBG;
328 
336  CompactedDBG(const int kmer_length = DEFAULT_K, const int minimizer_length = -1);
337 
343  CompactedDBG(const CompactedDBG<U, G>& o); // Copy constructor
344 
350  CompactedDBG(CompactedDBG<U, G>&& o); // Move constructor
351 
354  virtual ~CompactedDBG();
355 
363 
371 
384 
389  bool operator==(const CompactedDBG<U, G>& o) const;
390 
395  inline bool operator!=(const CompactedDBG<U, G>& o) const;
396 
399  void clear();
400 
405  bool build(CDBG_Build_opt& opt);
406 
413  bool simplify(const bool delete_short_isolated_unitigs = true, const bool clip_short_tips = true, const bool verbose = false);
414 
427  bool write( const string& output_fn, const size_t nb_threads = 1, const bool GFA_output = true, const bool FASTA_output = false,
428  const bool BFG_output = false, const bool write_index_file = true, const bool compressed_output = false,
429  const bool verbose = false) const;
430 
442  bool read(const string& input_graph_fn, const size_t nb_threads = 1, const bool verbose = false);
443 
455  bool read(const string& input_graph_fn, const string& input_index_fn, const size_t nb_threads = 1, const bool verbose = false);
456 
464  UnitigMap<U, G> find(const Kmer& km, const bool extremities_only = false);
465 
473  const_UnitigMap<U, G> find(const Kmer& km, const bool extremities_only = false) const;
474 
483  UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len);
484 
493  const_UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len) const;
494 
507  vector<pair<size_t, UnitigMap<U, G>>> searchSequence( const string& s, const bool exact, const bool insertion, const bool deletion,
508  const bool substitution, const bool or_exclusive_match = false);
509 
522  vector<pair<size_t, const_UnitigMap<U, G>>> searchSequence( const string& s, const bool exact, const bool insertion, const bool deletion,
523  const bool substitution, const bool or_exclusive_match = false) const;
524 
532  bool add(const string& seq, const bool verbose = false);
533 
539  bool remove(const const_UnitigMap<U, G>& um, const bool verbose = false);
540 
553  bool merge(const CompactedDBG& o, const size_t nb_threads = 1, const bool verbose = false);
554 
565  bool merge(const vector<CompactedDBG>& v, const size_t nb_threads = 1, const bool verbose = false);
566 
571 
576 
581 
586 
590  size_t length() const;
591 
595  size_t nbKmers() const;
596 
600  inline bool isInvalid() const { return invalid; }
601 
605  inline int getK() const { return k_; }
606 
610  inline int getG() const { return g_; }
611 
615  inline size_t size() const { return v_unitigs.size() + km_unitigs.size() + h_kmers_ccov.size(); }
616 
620  inline G* getData() { return data.getData(); }
621 
625  inline const G* getData() const { return data.getData(); }
626 
627  bool search(const vector<string>& query_filenames, const string& out_filename_prefix,
628  const double ratio_kmers, const bool inexact_search, const size_t nb_threads,
629  const size_t verbose = false) const;
630 
631  bool writeBinary(const string& fn, const size_t nb_threads = 1) const;
632  bool writeBinary(ostream& out, const size_t nb_threads = 1) const;
633 
634  bool readBinary(const string& fn);
635  bool readBinary(istream& in);
636 
637  protected:
638 
639  bool annotateSplitUnitigs(const CompactedDBG<U, G>& o, const size_t nb_threads = 1, const bool verbose = false);
640 
641  pair<size_t, size_t> splitAllUnitigs();
642  pair<size_t, size_t> getSplitInfoAllUnitigs() const;
643 
644  inline size_t joinUnitigs(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1) {
645 
646  return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
647  }
648 
649  bool mergeData(const CompactedDBG<U, G>& o, const size_t nb_threads = 1, const bool verbose = false);
650  bool mergeData(CompactedDBG<U, G>&& o, const size_t nb_threads = 1, const bool verbose = false);
651 
652  private:
653 
654  bool writeBinaryGraph(ostream& out, const size_t nb_threads = 1) const;
655  bool writeBinaryGraph(const string& fn, const size_t nb_threads = 1) const;
656 
657  bool writeBinaryIndex(ostream& out, const uint64_t checksum, const size_t nb_threads = 1) const;
658  bool writeBinaryIndex(const string& fn, const uint64_t checksum, const size_t nb_threads = 1) const;
659 
660  pair<uint64_t, bool> readBinaryGraph(istream& in);
661  pair<uint64_t, bool> readBinaryGraph(const string& fn);
662 
663  bool readBinaryIndex(istream& in, const uint64_t checksum);
664  bool readBinaryIndex(const string& fn, const uint64_t checksum);
665 
666  bool readBinaryIndexHead(const string& fn, size_t& file_format_version, size_t& v_unitigs_sz, size_t& km_unitigs_sz,
667  size_t& h_kmers_ccov_sz, size_t& hmap_min_unitigs_sz, uint64_t& read_checksum) const;
668 
669  bool readBinaryIndexHead(istream& in, size_t& file_format_version, size_t& v_unitigs_sz, size_t& km_unitigs_sz,
670  size_t& h_kmers_ccov_sz, size_t& hmap_min_unitigs_sz, uint64_t& read_checksum) const;
671 
672  uint64_t checksum() const;
673 
674  CompactedDBG<U, G>& toDataGraph(CompactedDBG<void, void>&& o, const size_t nb_threads = 1);
675 
676  pair<bool, pair<BlockedBloomFilter, Roaring>> filter(const CDBG_Build_opt& opt, const size_t nb_unique_kmers, const size_t nb_non_unique_kmers);
677  bool construct(const CDBG_Build_opt& opt, BlockedBloomFilter& bf, Roaring& r, const size_t nb_unique_minimizers, const size_t nb_non_unique_minimizers);
678 
679  void addUnitigSequence(const Kmer km, const string& seq, const size_t pos_match_km, const size_t len_match_km, LockGraph& lck_g, const bool map_read = true);
680  //void addUnitigSequence(const string& seq);
681 
682  size_t findUnitigSequenceBBF(const BlockedBloomFilter& bf, const Kmer km, string& s, bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
683  size_t findUnitigSequenceBBF(const BlockedBloomFilter& bf, const Kmer km, string& s, bool& isIsolated, vector<Kmer>& l_ignored_km_tip, LockGraph& lck_g);
684 
685  bool bwStepBBF(const BlockedBloomFilter& bf, const Kmer km, Kmer& front, char& c, bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip, const bool check_fp_cand = true) const;
686  bool fwStepBBF(const BlockedBloomFilter& bf, const Kmer km, Kmer& end, char& c, bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip, const bool check_fp_cand = true) const;
687 
688  inline size_t find(const preAllocMinHashIterator<RepHash>& it_min_h) const {
689 
690  const int pos = it_min_h.getPosition();
691  return (hmap_min_unitigs.find(Minimizer(it_min_h.s + pos).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
692  }
693 
694  UnitigMap<U, G> find(const char* s, const size_t pos_km, const minHashIterator<RepHash>& it_min, const bool extremities_only = false);
695  const_UnitigMap<U, G> find(const char* s, const size_t pos_km, const minHashIterator<RepHash>& it_min, const bool extremities_only = false) const;
696 
697  UnitigMap<U, G> find(const Kmer& km, const preAllocMinHashIterator<RepHash>& it_min_h);
698 
699  //vector<const_UnitigMap<U, G>> find(const Minimizer& minz) const;
700 
701  vector<const_UnitigMap<U, G>> findPredecessors(const Kmer& km, const bool extremities_only = false) const;
702  vector<const_UnitigMap<U, G>> findSuccessors(const Kmer& km, const size_t limit = 4, const bool extremities_only = false) const;
703 
704  vector<UnitigMap<U, G>> findPredecessors(const Kmer& km, const bool extremities_only = false);
705  vector<UnitigMap<U, G>> findSuccessors(const Kmer& km, const size_t limit = 4, const bool extremities_only = false);
706 
707  UnitigMap<U, G> findUnitig(const Kmer& km, const char* s, const size_t pos);
708  UnitigMap<U, G> findUnitig(const Kmer& km, const char* s, const size_t pos, const preAllocMinHashIterator<RepHash>& it_min_h);
709 
710  UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len, const minHashIterator<RepHash>& it_min);
711  const_UnitigMap<U, G> findUnitig(const char* s, const size_t pos, const size_t len, const minHashIterator<RepHash>& it_min) const;
712 
713  bool addUnitig(const string& str_unitig, const size_t id_unitig);
714  bool addUnitig(const string& str_unitig, const size_t id_unitig, const size_t id_unitig_r, const size_t is_short_r);
715  bool addUnitig(const string& str_unitig, const size_t id_unitig, SpinLock& lck_unitig, SpinLock& lck_kmer/*, const bool enable_abundant = true*/);
716  void swapUnitigs(const bool isShort, const size_t id_a, const size_t id_b);
717 
718  bool mergeUnitig(const string& seq, const bool verbose = false);
719  bool annotateSplitUnitig(const string& seq, const bool verbose = false);
720  bool annotateSplitUnitig(const string& seq, LockGraph& lck_g, const bool verbose = false);
721 
722  template<bool is_void>
723  inline typename std::enable_if<!is_void, void>::type mergeData_(const UnitigMap<U, G>& a, const const_UnitigMap<U, G>& b){
724 
725  a.getData()->merge(a, b);
726  }
727 
728  template<bool is_void>
729  inline typename std::enable_if<is_void, void>::type mergeData_(const UnitigMap<U, G>& a, const const_UnitigMap<U, G>& b) {}
730 
731  template<bool is_void>
732  typename std::enable_if<!is_void, void>::type deleteUnitig_(const bool isShort, const bool isAbundant,
733  const size_t id_unitig, const bool delete_data = true);
734 
735  template<bool is_void>
736  typename std::enable_if<is_void, void>::type deleteUnitig_( const bool isShort, const bool isAbundant,
737  const size_t id_unitig, const bool delete_data = true);
738 
739  void deleteUnitig_(const bool isShort, const bool isAbundant, const size_t id_unitig, const string& str);
740 
741  template<bool is_void>
742  typename std::enable_if<!is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs, size_t& nxt_pos_insert_v_unitigs,
743  size_t& v_unitigs_sz, size_t& v_kmers_sz, const vector<pair<int,int>>& sp);
744  template<bool is_void>
745  typename std::enable_if<is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs, size_t& nxt_pos_insert_v_unitigs,
746  size_t& v_unitigs_sz, size_t& v_kmers_sz, const vector<pair<int,int>>& sp);
747 
748  pair<size_t, size_t> extractAllUnitigs();
749 
750  template<bool is_void>
751  typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1);
752 
753  template<bool is_void>
754  typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins = nullptr, const size_t nb_threads = 1);
755 
756  void moveToAbundant();
757  void setFullCoverage(const size_t cov) const;
758 
759  void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins, const size_t nb_threads) const;
760  void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<char>& joins, const size_t nb_threads) const;
761 
762  bool checkJoin(const Kmer& a, const const_UnitigMap<U, G>& cm_a, Kmer& b) const;
763  void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
764  size_t removeUnitigs(bool rmIsolated, bool clipTips, vector<Kmer>& v);
765 
766  size_t joinTips(string filename_MBBF_uniq_kmers, const size_t nb_threads = 1, const bool verbose = false);
767  vector<Kmer> extractMercyKmers(const BlockedBloomFilter& bf_uniq_km, const size_t nb_threads = 1, const bool verbose = false);
768 
769  bool writeGFA(const string& fn, const size_t nb_threads = 1, const bool compressed_output = false) const;
770  bool writeFASTA(const string& fn, const bool compressed_output = false) const;
771 
772  void makeGraphFromGFA(const string& fn, const size_t nb_threads = 1);
773  void makeGraphFromFASTA(const string& fn, const size_t nb_threads = 1);
774 
775  pair<uint64_t, bool> readGraphFromIndexGFA(const string& graph_fn, const string& index_fn, const size_t k, const size_t g);
776  pair<uint64_t, bool> readGraphFromIndexFASTA(const string& graph_fn, const string& index_fn, const size_t k, const size_t g);
777 
778  template<bool is_void>
779  typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap) const;
780  template<bool is_void>
781  typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap) const;
782 
783  void mapRead(const const_UnitigMap<U, G>& um);
784  void mapRead(const const_UnitigMap<U, G>& um, LockGraph& lck_g);
785 
786  void unmapRead(const const_UnitigMap<U, G>& um);
787  void unmapRead(const const_UnitigMap<U, G>& um, LockGraph& lck_g);
788 
789  void setKmerGmerLength(const int kmer_length, const int minimizer_length = -1);
790  void print() const;
791 
792  vector<pair<size_t, UnitigMap<U, G>>> searchSequence( const string& seq, const bool exact, const bool insertion, const bool deletion,
793  const bool substitution, const double ratio_kmers, const bool or_exclusive_match);
794 
795  vector<pair<size_t, const_UnitigMap<U, G>>> searchSequence( const string& seq, const bool exact, const bool insertion, const bool deletion,
796  const bool substitution, const double ratio_kmers, const bool or_exclusive_match) const;
797 
798  int k_;
799  int g_;
800 
801  bool invalid;
802 
803  static const int tiny_vector_sz = 2;
804  static const int min_abundance_lim = 15;
805  static const int max_abundance_lim = 15;
806 
807  typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
808 
809  vector<Unitig<U>*> v_unitigs;
810  KmerCovIndex<U> km_unitigs;
811  h_kmers_ccov_t h_kmers_ccov;
812 
813  MinimizerIndex hmap_min_unitigs;
814 
815  wrapperData<G> data;
816 };
817 
818 #include "CompactedDBG.tcc"
819 #include "Search.tcc"
820 #include "IO.tcc"
821 
822 #endif
CDBG_Build_opt::outputBFG
bool outputBFG
Boolean indicating if the graph is written to a BFG/BFI file.
Definition: CompactedDBG.hpp:161
CDBG_Build_opt::nb_bits_kmers_bf
size_t nb_bits_kmers_bf
Number of Bloom filter bits per k-mer occurring in the FASTA/FASTQ/GFA files of CDBG_Build_opt::filen...
Definition: CompactedDBG.hpp:136
CompactedDBG::size
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:615
CompactedDBG::find
const_UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false) const
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
CompactedDBG::operator=
CompactedDBG< U, G > & operator=(CompactedDBG< U, G > &&o)
Move assignment operator (move a compacted de Bruijn graph).
CompactedDBG::simplify
bool simplify(const bool delete_short_isolated_unitigs=true, const bool clip_short_tips=true, const bool verbose=false)
Simplify the Compacted de Bruijn graph: clip short (< 2k length) tips and/or delete short (< 2k lengt...
CDBG_Build_opt::outputFASTA
bool outputFASTA
Boolean indicating if the graph is written to a FASTA file.
Definition: CompactedDBG.hpp:160
CDBG_Build_opt::useMercyKmers
bool useMercyKmers
Keep in the graph low coverage k-mers (cov=1) connecting tips of the graph.
Definition: CompactedDBG.hpp:157
CDBG_Data_t::extract
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:272
CompactedDBG::operator+=
CompactedDBG< U, G > & operator+=(const CompactedDBG< U, G > &o)
Addition assignment operator (merge a compacted de Bruijn graph).
CompactedDBG::operator=
CompactedDBG< U, G > & operator=(const CompactedDBG< U, G > &o)
Copy assignment operator (copy a compacted de Bruijn graph).
CDBG_Build_opt::outFilenameBBF
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:139
CDBG_Data_t::serialize
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:282
CompactedDBG::read
bool read(const string &input_graph_fn, const string &input_index_fn, const size_t nb_threads=1, const bool verbose=false)
Read a Compacted de Bruijn graph from disk (GFA1, FASTA or BFG format) using an index file (BFI forma...
CompactedDBG::add
bool add(const string &seq, const bool verbose=false)
Add a sequence to the Compacted de Bruijn graph.
CompactedDBG::isInvalid
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files,...
Definition: CompactedDBG.hpp:600
CompactedDBG::findUnitig
UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len)
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
CompactedDBG::getData
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:625
CompactedDBG::operator==
bool operator==(const CompactedDBG< U, G > &o) const
Equality operator.
Kmer
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:42
CompactedDBG::const_iterator
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:330
CompactedDBG::operator!=
bool operator!=(const CompactedDBG< U, G > &o) const
Inequality operator.
CompactedDBG::remove
bool remove(const const_UnitigMap< U, G > &um, const bool verbose=false)
Remove a unitig from the Compacted de Bruijn graph.
CDBG_Build_opt::prefixFilenameOut
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:170
CompactedDBG::getK
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:605
CompactedDBG::getG
int getG() const
Return the length of minimizers of the graph.
Definition: CompactedDBG.hpp:610
CompactedDBG::CompactedDBG
CompactedDBG(const CompactedDBG< U, G > &o)
Copy constructor (copy a compacted de Bruijn graph).
CompactedDBG::CompactedDBG
CompactedDBG(CompactedDBG< U, G > &&o)
Move constructor (move a compacted de Bruijn graph).
CompactedDBG::CompactedDBG
CompactedDBG(const int kmer_length=31, const int minimizer_length=-1)
Constructor (set up an empty compacted dBG).
UnitigMap.hpp
UnitigMap type interface.
CompactedDBG::build
bool build(CDBG_Build_opt &opt)
Build the Compacted de Bruijn graph.
CDBG_Build_opt::build
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:151
CDBG_Build_opt::filename_seq_in
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:141
CDBG_Data_t
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:220
UnitigMap
Contain all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: UnitigMap.hpp:92
CDBG_Build_opt::verbose
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:130
CompactedDBG::read
bool read(const string &input_graph_fn, const size_t nb_threads=1, const bool verbose=false)
Load a Compacted de Bruijn graph from disk (GFA1 or FASTA format).
CDBG_Build_opt::filename_graph_in
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:172
CompactedDBG::begin
iterator begin()
Create an iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sorted lexic...
CompactedDBG::end
const_iterator end() const
Create a constant iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs ar...
CompactedDBG::find
UnitigMap< U, G > find(const Kmer &km, const bool extremities_only=false)
Find the unitig containing the queried k-mer in the Compacted de Bruijn graph.
CDBG_Build_opt::clipTips
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:155
Unitig.hpp
The Unitig interface.
CompactedDBG::findUnitig
const_UnitigMap< U, G > findUnitig(const char *s, const size_t pos, const size_t len) const
Find the unitig containing the k-mer starting at a given position in a query sequence and extends the...
CompactedDBG::begin
const_iterator begin() const
Create an constant iterator to the first unitig of the Compacted de Bruijn graph (unitigs are NOT sor...
CDBG_Build_opt::update
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:152
CDBG_Data_t::clear
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:229
CompactedDBG::searchSequence
vector< pair< size_t, const_UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false) const
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
neighborIterator
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
CDBG_Build_opt::filename_ref_in
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:142
CDBG_Build_opt::nb_threads
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:132
unitigIterator
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
CDBG_Data_t::concat
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:244
CompactedDBG::merge
bool merge(const vector< CompactedDBG > &v, const size_t nb_threads=1, const bool verbose=false)
Merge multiple compacted de Bruijn graphs.
CDBG_Build_opt
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for:
Definition: CompactedDBG.hpp:128
CompactedDBG::getData
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:620
CompactedDBG::clear
void clear()
Clear the graph: empty the graph and reset its parameters.
CompactedDBG::end
iterator end()
Create an iterator to the "past-the-last" unitig of the Compacted de Bruijn graph (unitigs are NOT so...
Kmer.hpp
Interface for the class Kmer:
CompactedDBG::searchSequence
vector< pair< size_t, UnitigMap< U, G > > > searchSequence(const string &s, const bool exact, const bool insertion, const bool deletion, const bool substitution, const bool or_exclusive_match=false)
Performs exact and/or inexact search of the k-mers of a sequence query in the Compacted de Bruijn gra...
UnitigMap::getData
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
UnitigIterator.hpp
The unitigIterator type interface.
CompactedDBG::~CompactedDBG
virtual ~CompactedDBG()
Destructor.
CDBG_Build_opt::deleteIsolated
bool deleteIsolated
Remove short isolated unitigs (length < 2k) of the graph (not used by CompactedDBG<U,...
Definition: CompactedDBG.hpp:156
CompactedDBG::length
size_t length() const
Return the sum of the unitigs length.
DataStorage
Definition: ColorSet.hpp:16
CDBG_Data_t::merge
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B to the data of a sub-unitig A.
Definition: CompactedDBG.hpp:256
CompactedDBG::write
bool write(const string &output_fn, const size_t nb_threads=1, const bool GFA_output=true, const bool FASTA_output=false, const bool BFG_output=false, const bool write_index_file=true, const bool compressed_output=false, const bool verbose=false) const
Write the Compacted de Bruijn graph to disk (GFA1 format).
CompactedDBG::iterator
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:329
CompactedDBG
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:313
CDBG_Build_opt::outputGFA
bool outputGFA
Boolean indicating if the graph is written to a GFA file.
Definition: CompactedDBG.hpp:159
CompactedDBG::merge
bool merge(const CompactedDBG &o, const size_t nb_threads=1, const bool verbose=false)
Merge a compacted de Bruijn graph.
CompactedDBG::nbKmers
size_t nbKmers() const
Return the number of k-mers in the graph.
CDBG_Build_opt::inFilenameBBF
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U,...
Definition: CompactedDBG.hpp:138
CDBG_Build_opt::k
int k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:149