bpp-seq-omics  2.2.0
MafStatistics.h
Go to the documentation of this file.
1 //
2 // File: MafStatistics.h
3 // Authors: Julien Dutheil
4 // Created: Mon Jun 25 2012
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2012)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #ifndef _MAFSTATISTICS_H_
41 #define _MAFSTATISTICS_H_
42 
43 #include "MafBlock.h"
44 
45 //From bpp-core:
46 #include <Bpp/Utils/MapTools.h>
47 #include <Bpp/Numeric/VectorTools.h>
48 #include <Bpp/Numeric/Number.h>
49 
50 //From the STL:
51 #include <map>
52 #include <string>
53 
54 namespace bpp {
55 
63 {
64  protected:
65  mutable std::map<std::string, BppNumberI*> values_;
66 
67  public:
69  virtual ~MafStatisticsResult() {}
70 
72  {
73  for (std::map<std::string, BppNumberI*>::const_iterator it = msr.values_.begin();
74  it != msr.values_.end();
75  ++it) {
76  values_[it->first] = it->second->clone();
77  }
78  }
79 
80  public:
81  virtual const BppNumberI& getValue(const std::string& tag) const throw (Exception) {
82  std::map<std::string, BppNumberI*>::iterator it = values_.find(tag);
83  if (it != values_.end())
84  return *it->second;
85  else
86  throw Exception("MafStatisticsResult::getValue(). No value found for tag: " + tag + ".");
87  }
88 
95  virtual void setValue(const std::string& tag, double value) throw (Exception) {
96  if (values_[tag]) {
97  delete values_[tag];
98  }
99  values_[tag] = new BppDouble(value);
100  }
101 
108  virtual void setValue(const std::string& tag, int value) throw (Exception) {
109  if (values_[tag]) {
110  delete values_[tag];
111  }
112  values_[tag] = new BppInteger(value);
113  }
114 
121  virtual void setValue(const std::string& tag, unsigned int value) throw (Exception) {
122  if (values_[tag]) {
123  delete values_[tag];
124  }
125  values_[tag] = new BppUnsignedInteger(value);
126  }
127 
132  virtual bool hasValue(const std::string& tag) const {
133  return (values_.find(tag) != values_.end());
134  }
135 
139  std::vector<std::string> getAvailableTags() const { return MapTools::getKeys(values_); }
140 };
141 
146  public virtual MafStatisticsResult
147 {
148  private:
149  std::string name_;
150 
151  public:
152  SimpleMafStatisticsResult(const std::string& name): MafStatisticsResult(), name_(name) {
153  setValue(name, 0);
154  }
156 
157  public:
158  virtual const BppNumberI& getValue(const std::string& tag) const throw (Exception) { return SimpleMafStatisticsResult::getValue(tag); }
159 
160  virtual const BppNumberI& getValue() const { return *values_[name_]; }
161 
162  virtual void setValue(const std::string& tag, double value) throw (Exception) {
163  if (tag == name_)
164  setValue(value);
165  else
166  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
167  }
168 
169  virtual void setValue(const std::string& tag, int value) throw (Exception) {
170  if (tag == name_)
171  setValue(value);
172  else
173  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
174  }
175 
176  virtual void setValue(const std::string& tag, unsigned int value) throw (Exception) {
177  if (tag == name_)
178  setValue(value);
179  else
180  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
181  }
182 
183  virtual void setValue(double value) {
184  if (values_[name_]) delete values_[name_];
185  values_[name_] = new BppDouble(value);
186  }
187 
188  virtual void setValue(int value) {
189  if (values_[name_]) delete values_[name_];
190  values_[name_] = new BppInteger(value);
191  }
192 
193  virtual void setValue(unsigned int value) {
194  if (values_[name_]) delete values_[name_];
195  values_[name_] = new BppUnsignedInteger(value);
196  }
197 
198 };
199 
207 {
208  public:
210  virtual ~MafStatistics() {}
211 
212  public:
213  virtual std::string getShortName() const = 0;
214  virtual std::string getFullName() const = 0;
215  virtual const MafStatisticsResult& getResult() const = 0;
216  virtual void compute(const MafBlock& block) = 0;
217 
221  virtual std::vector<std::string> getSupportedTags() const = 0;
222 
223 };
224 
229  public virtual MafStatistics
230 {
231  protected:
233 
234  public:
237 
238  public:
239  const MafStatisticsResult& getResult() const { return result_; }
240 };
241 
246  public MafStatistics
247 {
248  protected:
250 
251  public:
252  AbstractMafStatisticsSimple(const std::string& name): result_(name) {}
254 
255  public:
256  const SimpleMafStatisticsResult& getResult() const { return result_; }
257  std::vector<std::string> getSupportedTags() const { return result_.getAvailableTags(); }
258 };
259 
265 {
266  private:
267  std::string species1_;
268  std::string species2_;
269 
270  public:
271  PairwiseDivergenceMafStatistics(const std::string& species1, const std::string& species2):
272  AbstractMafStatisticsSimple("Divergence"), species1_(species1), species2_(species2) {}
273 
275 
276  public:
277  std::string getShortName() const { return "Div." + species1_ + "-" + species2_; }
278  std::string getFullName() const { return "Pairwise divergence between " + species1_ + " and " + species2_ + "."; }
279  void compute(const MafBlock& block);
280 
281 };
282 
288 {
289  public:
292 
293  public:
294  std::string getShortName() const { return "BlockSize"; }
295  std::string getFullName() const { return "Number of sequences."; }
296  void compute(const MafBlock& block) {
297  result_.setValue(static_cast<double>(block.getNumberOfSequences()));
298  }
299 };
300 
306 {
307  public:
310 
311  public:
312  std::string getShortName() const { return "BlockLength"; }
313  std::string getFullName() const { return "Number of sites."; }
314  void compute(const MafBlock& block) {
315  result_.setValue(static_cast<double>(block.getNumberOfSites()));
316  }
317 };
318 
327 {
328  private:
329  std::string species_;
330 
331  public:
332  SequenceLengthMafStatistics(const std::string& species): AbstractMafStatisticsSimple("BlockSize"), species_(species) {}
334 
335  public:
336  std::string getShortName() const { return "SequenceLengthFor" + species_; }
337  std::string getFullName() const { return "Sequence length for species " + species_; }
338  void compute(const MafBlock& block) {
339  std::vector<const MafSequence*> seqs = block.getSequencesForSpecies(species_);
340  if (seqs.size() == 0)
341  result_.setValue(0.);
342  else if (seqs.size() == 1)
343  result_.setValue(static_cast<double>(SequenceTools::getNumberOfSites(*seqs[0])));
344  else
345  throw Exception("SequenceLengthMafStatistics::compute. More than one sequence found for species " + species_ + " in current block.");
346  }
347 };
348 
349 
355 {
356  public:
359 
360  public:
361  std::string getShortName() const { return "AlnScore"; }
362  std::string getFullName() const { return "Alignment score."; }
363  void compute(const MafBlock& block) {
364  result_.setValue(block.getScore());
365  }
366 };
367 
381  public AbstractMafStatistics
382 {
383  private:
384  const Alphabet* alphabet_;
385 
386  public:
387  CharacterCountsMafStatistics(const Alphabet* alphabet): AbstractMafStatistics(), alphabet_(alphabet) {}
389  AbstractMafStatistics(stats), alphabet_(stats.alphabet_) {}
391  AbstractMafStatistics::operator=(stats);
392  alphabet_ = stats.alphabet_;
393  return *this;
394  }
395 
397 
398  public:
399  std::string getShortName() const { return "Count"; }
400  std::string getFullName() const { return "Character counts."; }
401  void compute(const MafBlock& block);
402  std::vector<std::string> getSupportedTags() const;
403 };
404 
405 
412  public virtual MafStatistics
413 {
414  private:
415  std::vector<std::string> species_;
416 
417  public:
418  AbstractSpeciesSelectionMafStatistics(const std::vector<std::string>& species):
419  species_(species) {}
420 
421  protected:
422  SiteContainer* getSiteContainer_(const MafBlock& block);
423 
424 };
425 
426 
433  public virtual MafStatistics
434 {
435  private:
436  std::vector< std::vector<std::string> > species_;
437 
438  public:
439  AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector<std::string> >& species);
440 
441  protected:
442  std::vector<SiteContainer*> getSiteContainers_(const MafBlock& block);
443 
444 };
445 
446 
454  public AbstractMafStatistics,
456 {
457  private:
458  class Categorizer {
459  private:
460  std::vector<double> bounds_;
461 
462  public:
463  Categorizer(const std::vector<double>& bounds):
464  bounds_(bounds) {
465  std::sort(bounds_.begin(), bounds_.end());
466  }
467 
468  public:
469  size_t getNumberOfCategories() const { return (bounds_.size() - 1); }
470 
471  //Category numbers start at 1!
472  size_t getCategory(double value) const throw (OutOfRangeException) {
473  if (value < bounds_[0])
474  throw OutOfRangeException("SiteFrequencySpectrumMafStatistics::Categorizer::getCategory.", value, *bounds_.begin(), *bounds_.rbegin());
475  for (size_t i = 1; i < bounds_.size(); ++i) {
476  if (value < bounds_[i])
477  return i;
478  }
479  throw OutOfRangeException("SiteFrequencySpectrumMafStatistics::Categorizer::getCategory.", value, *bounds_.begin(), *bounds_.rbegin());
480  }
481  };
482 
483  private:
484  const Alphabet* alphabet_;
486  std::vector<unsigned int> counts_;
487  std::string outgroup_;
488 
489  public:
490  SiteFrequencySpectrumMafStatistics(const Alphabet* alphabet, const std::vector<double>& bounds, const std::vector<std::string>& ingroup, const std::string outgroup = ""):
493  alphabet_(alphabet),
494  categorizer_(bounds),
495  counts_(bounds.size() - 1),
496  outgroup_(outgroup)
497  {}
498 
502  alphabet_(stats.alphabet_),
503  categorizer_(stats.categorizer_),
504  counts_(stats.counts_),
505  outgroup_(stats.outgroup_)
506  {}
507 
509  AbstractMafStatistics::operator=(stats);
510  AbstractSpeciesSelectionMafStatistics::operator=(stats);
511  alphabet_ = stats.alphabet_;
512  categorizer_ = stats.categorizer_;
513  counts_ = stats.counts_;
514  outgroup_ = stats.outgroup_;
515  return *this;
516  }
517 
519 
520  public:
521  std::string getShortName() const { return "SiteFrequencySpectrum"; }
522  std::string getFullName() const { return "Site frequency spectrum."; }
523  void compute(const MafBlock& block);
524  std::vector<std::string> getSupportedTags() const;
525 };
526 
527 
539  public AbstractMafStatistics,
541 {
542  private:
543  const Alphabet* alphabet_;
544  std::vector<unsigned int> counts_;
545 
546  public:
548  const Alphabet* alphabet,
549  const std::vector<std::string>& species):
552  alphabet_(alphabet),
553  counts_(6)
554  {
555  if (species.size() != 4)
556  throw Exception("FourSpeciesPatternCountsMafStatistics, constructor: 4 species should be provided.");
557  if (VectorTools::unique(species).size() != 4)
558  throw Exception("FourSpeciesPatternCountsMafStatistics, constructor: duplicated species name!");
559  }
560 
564  alphabet_(stats.alphabet_),
565  counts_(stats.counts_)
566  {}
567 
569  AbstractMafStatistics::operator=(stats);
570  AbstractSpeciesSelectionMafStatistics::operator=(stats);
571  alphabet_ = stats.alphabet_;
572  counts_ = stats.counts_;
573  return *this;
574  }
575 
577 
578  public:
579  std::string getShortName() const { return "FourSpeciesPatternCounts"; }
580  std::string getFullName() const { return "FourSpecies pattern counts."; }
581  void compute(const MafBlock& block);
582  std::vector<std::string> getSupportedTags() const;
583 };
584 
585 
586 
596  public AbstractMafStatistics,
598 {
599  public:
600  SiteMafStatistics(const std::vector<std::string>& species):
603  {}
604 
605  virtual ~SiteMafStatistics() {}
606 
607  public:
608  std::string getShortName() const { return "SiteStatistics"; }
609  std::string getFullName() const { return "Site statistics."; }
610  void compute(const MafBlock& block);
611  std::vector<std::string> getSupportedTags() const;
612 };
613 
614 
628  public AbstractMafStatistics,
630 {
631  public:
632  PolymorphismMafStatistics(const std::vector< std::vector<std::string> >& species):
635  {
636  if (species.size() != 2)
637  throw Exception("PolymorphismStatistics: exactly two species selection should be provided.");
638  }
639 
641 
642  public:
643  std::string getShortName() const { return "PolymorphismStatistics"; }
644  std::string getFullName() const { return "Polymorphism statistics."; }
645  void compute(const MafBlock& block);
646  std::vector<std::string> getSupportedTags() const;
647 
648  private:
649  static std::vector<int> getPatterns_(const SiteContainer& sites);
650 };
651 
652 
653 
663  public AbstractMafStatistics,
665 {
666  public:
667  SequenceDiversityMafStatistics(const std::vector<std::string>& ingroup):
670  {}
671 
673 
674  public:
675  std::string getShortName() const { return "SequenceDiversityStatistics"; }
676  std::string getFullName() const { return "Sequence diversity statistics."; }
677  void compute(const MafBlock& block);
678  std::vector<std::string> getSupportedTags() const;
679 
680  private:
681  static std::vector<int> getPatterns_(const SiteContainer& sites);
682 };
683 
684 
685 } // end of namespace bpp
686 
687 #endif //_MAFSTATISTICS_H_
688 
Counts number of polymorphic / fixed sites in two populations.
virtual const BppNumberI & getValue(const std::string &tag) const
Definition: MafStatistics.h:81
CharacterCountsMafStatistics(const CharacterCountsMafStatistics &stats)
MafStatisticsResult(const MafStatisticsResult &msr)
Definition: MafStatistics.h:71
std::vector< std::string > getSupportedTags() const
std::string getShortName() const
std::map< std::string, BppNumberI * > values_
Definition: MafStatistics.h:65
PolymorphismMafStatistics(const std::vector< std::vector< std::string > > &species)
void compute(const MafBlock &block)
std::string getShortName() const
Provide estimates of sequence diversity.
void compute(const MafBlock &block)
const MafStatisticsResult & getResult() const
General interface for storing statistical results.
Definition: MafStatistics.h:62
FourSpeciesPatternCountsMafStatistics(const FourSpeciesPatternCountsMafStatistics &stats)
std::string getFullName() const
SiteMafStatistics(const std::vector< std::string > &species)
virtual void setValue(int value)
virtual bool hasValue(const std::string &tag) const
std::vector< const MafSequence * > getSequencesForSpecies(const std::string &species) const
Definition: MafBlock.h:149
Partial implementation of MafStatistics, for convenience.
SimpleMafStatisticsResult result_
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
Retrieve the sequence length (number of nucleotides) for a given species in a maf block...
std::vector< unsigned int > counts_
Retrieves the alignment score of a maf block.
General interface for computing statistics based on a Maf block.
virtual void setValue(const std::string &tag, unsigned int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
size_t getNumberOfSites() const
Definition: MafBlock.h:113
std::string getShortName() const
std::string getShortName() const
std::vector< std::string > getSupportedTags() const
SiteFrequencySpectrumMafStatistics & operator=(const SiteFrequencySpectrumMafStatistics &stats)
Computes the number of columns in a maf block.
virtual std::string getShortName() const =0
void compute(const MafBlock &block)
Compute the frequency of site patterns for a quadruplet of species.
FourSpeciesPatternCountsMafStatistics(const Alphabet *alphabet, const std::vector< std::string > &species)
CharacterCountsMafStatistics & operator=(const CharacterCountsMafStatistics &stats)
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
AbstractMafStatisticsSimple(const std::string &name)
virtual void setValue(const std::string &tag, unsigned int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
A synteny block data structure, the basic unit of a MAF alignment file.
Definition: MafBlock.h:55
std::string getShortName() const
std::vector< std::string > getAvailableTags() const
std::string getShortName() const
Compute the Site Frequency Spectrum of a maf block.
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
SiteContainer * getSiteContainer_(const MafBlock &block)
Partial implementation of MafStatistics, for convenience.
virtual std::string getFullName() const =0
std::string getFullName() const
SiteFrequencySpectrumMafStatistics(const Alphabet *alphabet, const std::vector< double > &bounds, const std::vector< std::string > &ingroup, const std::string outgroup="")
SiteFrequencySpectrumMafStatistics(const SiteFrequencySpectrumMafStatistics &stats)
virtual std::vector< std::string > getSupportedTags() const =0
void compute(const MafBlock &block)
void compute(const MafBlock &block)
const SimpleMafStatisticsResult & getResult() const
virtual const MafStatisticsResult & getResult() const =0
std::vector< std::vector< std::string > > species_
virtual const BppNumberI & getValue(const std::string &tag) const
virtual void compute(const MafBlock &block)=0
Compute the base frequencies of a maf block.
std::vector< SiteContainer * > getSiteContainers_(const MafBlock &block)
std::vector< unsigned int > counts_
virtual void setValue(const std::string &tag, int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
std::string getFullName() const
virtual const BppNumberI & getValue() const
virtual ~MafStatistics()
PairwiseDivergenceMafStatistics(const std::string &species1, const std::string &species2)
static std::vector< int > getPatterns_(const SiteContainer &sites)
Partial implementation of MafStatistics for method working on a subset of species, in a site-wise manner.
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
static std::vector< int > getPatterns_(const SiteContainer &sites)
AbstractSpeciesSelectionMafStatistics(const std::vector< std::string > &species)
FourSpeciesPatternCountsMafStatistics & operator=(const FourSpeciesPatternCountsMafStatistics &stats)
std::string getFullName() const
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
Definition: MafStatistics.h:95
AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector< std::string > > &species)
double getScore() const
Definition: MafBlock.h:105
Partial implementation of MafStatistics for method working on multiple distinct subsets of species...
void compute(const MafBlock &block)
std::string getFullName() const
Computes the pairwise divergence for a pair of sequences in a maf block.
Computes the number of sequences in a maf block.
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
std::string getShortName() const
Compute a few site statistics in a maf block.
MafStatisticsResult result_
virtual void setValue(const std::string &tag, int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
SequenceDiversityMafStatistics(const std::vector< std::string > &ingroup)
A simple maf statistics result, with only one value.
Categorizer(const std::vector< double > &bounds)
SequenceLengthMafStatistics(const std::string &species)
CharacterCountsMafStatistics(const Alphabet *alphabet)
SimpleMafStatisticsResult(const std::string &name)
std::string getFullName() const
virtual void setValue(double value)
virtual void setValue(unsigned int value)
std::vector< std::string > getSupportedTags() const