bpp-popgen  2.2.0
SequenceStatistics.h
Go to the documentation of this file.
1 //
2 // File SequenceStatistics.h
3 // Authors: Eric Bazin
4 // Sylvain Gaillard
5 // Khalid Belkhir
6 // Benoit Nabholz
7 // Created on: Wed Aug 04 2004
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide classes
14  for population genetics analysis.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41  */
42 
43 // Secured inclusion of header's file
44 #ifndef _SEQUENCESTATISTICS_H_
45 #define _SEQUENCESTATISTICS_H_
46 
47 // From the bpp-seq library
48 #include <Bpp/Seq/SymbolListTools.h>
49 #include <Bpp/Seq/Alphabet/CodonAlphabet.h>
50 #include <Bpp/Seq/GeneticCode/GeneticCode.h>
51 #include <Bpp/Seq/Container/SiteContainerIterator.h>
52 #include <Bpp/Seq/Container/SiteContainer.h>
53 #include <Bpp/Seq/Container/SiteContainerTools.h>
54 
56 
57 // From the STL
58 #include <string>
59 #include <map>
60 #include <vector>
61 
62 namespace bpp
63 {
70 {
71 public:
87  static unsigned int numberOfPolymorphicSites(
89  bool gapflag = true,
90  bool ignoreUnknown = true);
91 
99  static unsigned int numberOfParsimonyInformativeSites(
101  bool gapflag = true);
102 
111  static unsigned int numberOfSingletons(
113  bool gapflag = true);
114 
125  static unsigned int totalNumberOfMutations(
127  bool gapflag = true);
128 
142  static unsigned int totalNumberOfMutationsOnExternalBranches(
144  const PolymorphismSequenceContainer& outg)
145  throw (Exception);
146 
154  static unsigned int numberOfTriplets(
156  bool gapflag = true);
157 
164  static double heterozygosity(
166  bool gapflag = true);
167 
175  static double squaredHeterozygosity(
177  bool gapflag = true);
178 
/**
 * @brief Compute the mean GC content in an alignment.
 *
 * @param psc The alignment, as a const PolymorphismSequenceContainer reference.
 * @return The mean GC content, as a double.
 */
184  static double gcContent(
185  const PolymorphismSequenceContainer& psc);
186 
201  static std::vector<unsigned int> gcPolymorphism(
203  bool gapflag = true);
204 
221  static double watterson75(
223  bool gapflag = true,
224  bool ignoreUnknown = true);
225 
243  static double tajima83(
245  bool gapflag = true);
246 
255  static double fayWu2000(
257  const Sequence& ancestralSites);
258 
271  static unsigned int dvk(
273  bool gapflag = true);
274 
287  static double dvh(
289  bool gapflag = true);
290 
/**
 * @brief Return the number of transitions.
 *
 * @param psc The alignment, as a const PolymorphismSequenceContainer reference.
 * @return The number of transitions, as an unsigned int.
 */
297  static unsigned int numberOfTransitions(
298  const PolymorphismSequenceContainer& psc);
299 
/**
 * @brief Return the number of transversions.
 *
 * @param psc The alignment, as a const PolymorphismSequenceContainer reference.
 * @return The number of transversions, as an unsigned int.
 */
306  static unsigned int numberOfTransversions(
307  const PolymorphismSequenceContainer& psc);
308 
/**
 * @brief Return the ratio of transitions/transversions.
 *
 * @param psc The alignment, as a const PolymorphismSequenceContainer reference.
 * @return The transitions/transversions ratio, as a double.
 * @throw Exception Declared in the exception specification; the triggering
 *        condition is not visible in this header (presumably a zero count of
 *        transversions -- TODO confirm against the implementation).
 */
315  static double ratioOfTransitionsTransversions(
316  const PolymorphismSequenceContainer& psc )
317  throw (Exception);
318 
328  static unsigned int numberOfSitesWithStopCodon(
330  const GeneticCode& gCode,
331  bool gapflag = true);
332 
345  static unsigned int numberOfMonoSitePolymorphicCodons(
347  bool stopflag = true,
348  bool gapflag = true);
349 
360  static unsigned int numberOfSynonymousPolymorphicCodons(
362  const GeneticCode& gc);
363 
378  static double watterson75Synonymous(
380  const GeneticCode& gc);
381 
396  static double watterson75NonSynonymous(
398  const GeneticCode& gc);
399 
415  static double piSynonymous(
417  const GeneticCode& gc,
418  bool minchange = false);
419 
435  static double piNonSynonymous(
437  const GeneticCode& gc,
438  bool minchange = false);
439 
454  static double meanNumberOfSynonymousSites(
456  const GeneticCode& gc,
457  double ratio = 1.);
458 
472  static double meanNumberOfNonSynonymousSites(
474  const GeneticCode& gc,
475  double ratio = 1.);
476 
492  static unsigned int numberOfSynonymousSubstitutions(
494  const GeneticCode& gc,
495  double freqmin = 0.);
496 
512  static unsigned int numberOfNonSynonymousSubstitutions(
514  const GeneticCode& gc,
515  double freqmin = 0.);
516 
534  static std::vector<unsigned int> fixedDifferences(
535  const PolymorphismSequenceContainer& pscin,
536  const PolymorphismSequenceContainer& pscout,
538  const GeneticCode& gc);
539 
/**
 * @brief Return a vector containing Pa, Ps, Da, Ds.
 *
 * @param ingroup  The ingroup alignment.
 * @param outgroup The outgroup alignment.
 * @param gc       The genetic code.
 * @param freqmin  Defaults to 0. (presumably a minimum allele frequency
 *                 threshold -- TODO confirm against the implementation).
 * @return A vector of unsigned int holding Pa, Ps, Da and Ds.
 */
551  static std::vector<unsigned int> mkTable(
552  const PolymorphismSequenceContainer& ingroup,
553  const PolymorphismSequenceContainer& outgroup,
554  const GeneticCode& gc,
555  double freqmin = 0.);
556 
/**
 * @brief Return the neutrality index NI = (Pa/Ps)/(Da/Ds)
 *        (Rand & Kann 1996, Mol. Biol. Evol. 13 pp735-748).
 *
 * @param ingroup  The ingroup alignment.
 * @param outgroup The outgroup alignment.
 * @param gc       The genetic code.
 * @param freqmin  Defaults to 0. (presumably a minimum allele frequency
 *                 threshold -- TODO confirm against the implementation).
 * @return The neutrality index, as a double.
 */
570  static double neutralityIndex(
571  const PolymorphismSequenceContainer& ingroup,
572  const PolymorphismSequenceContainer& outgroup,
573  const GeneticCode& gc,
574  double freqmin = 0.);
575 
591  static double tajimaDss(
593  bool gapflag = true)
594  throw (ZeroDivisionException);
595 
609  static double tajimaDtnm(
611  bool gapflag = true)
612  throw (ZeroDivisionException);
613 
/**
 * @brief Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
 *
 * @param ingroup  The ingroup alignment.
 * @param outgroup The outgroup alignment.
 * @param original Defaults to true; its exact effect is not visible in this
 *                 header (presumably selects the original formulation of the
 *                 statistic -- TODO confirm).
 * @return The D statistic, as a double.
 * @throw ZeroDivisionException Declared in the exception specification.
 */
629  static double fuLiD(
630  const PolymorphismSequenceContainer& ingroup,
631  const PolymorphismSequenceContainer& outgroup,
632  bool original = true)
633  throw (ZeroDivisionException);
634 
/**
 * @brief Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
 *
 * @param group The alignment (a single group; no outgroup is taken).
 * @return The D* statistic, as a double.
 * @throw ZeroDivisionException Declared in the exception specification.
 */
641  static double fuLiDStar(
642  const PolymorphismSequenceContainer& group)
643  throw (ZeroDivisionException);
644 
/**
 * @brief Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
 *
 * @param ingroup  The ingroup alignment.
 * @param outgroup The outgroup alignment.
 * @param original Defaults to true; its exact effect is not visible in this
 *                 header (presumably selects the original formulation of the
 *                 statistic -- TODO confirm).
 * @return The F statistic, as a double.
 * @throw ZeroDivisionException Declared in the exception specification.
 */
659  static double fuLiF(
660  const PolymorphismSequenceContainer& ingroup,
661  const PolymorphismSequenceContainer& outgroup,
662  bool original = true)
663  throw (ZeroDivisionException);
664 
/**
 * @brief Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
 *
 * @param group The alignment (a single group; no outgroup is taken).
 * @return The F* statistic, as a double.
 * @throw ZeroDivisionException Declared in the exception specification.
 */
671  static double fuLiFStar(
672  const PolymorphismSequenceContainer& group)
673  throw (ZeroDivisionException);
674 
693  double fstHudson92(
695  size_t id1,
696  size_t id2);
697 
698 
727  bool keepsingleton = true,
728  double freqmin = 0.);
729 
743  static Vdouble pairwiseDistances1(
745  bool keepsingleton = true,
746  double freqmin = 0.)
747  throw (DimensionException);
748 
763  static Vdouble pairwiseDistances2(
765  bool keepsingleton = true,
766  double freqmin = 0.)
767  throw (DimensionException);
768 
781  static Vdouble pairwiseD(
783  bool keepsingleton = true,
784  double freqmin = 0.)
785  throw (DimensionException);
786 
799  static Vdouble pairwiseDprime(
801  bool keepsingleton = true,
802  double freqmin = 0.)
803  throw (DimensionException);
804 
817  static Vdouble pairwiseR2(
819  bool keepsingleton = true,
820  double freqmin = 0.)
821  throw (DimensionException);
822 
835  static double meanD(
837  bool keepsingleton = true,
838  double freqmin = 0.)
839  throw (DimensionException);
840 
853  static double meanDprime(
855  bool keepsingleton = true,
856  double freqmin = 0.)
857  throw (DimensionException);
858 
871  static double meanR2(
873  bool keepsingleton = true,
874  double freqmin = 0.)
875  throw (DimensionException);
876 
888  static double meanDistance1(
890  bool keepsingleton = true,
891  double freqmin = 0.)
892  throw (DimensionException);
893 
905  static double meanDistance2(
907  bool keepsingleton = true,
908  double freqmin = 0.)
909  throw (DimensionException);
910 
927  static double originRegressionD(
929  bool distance1 = false,
930  bool keepsingleton = true,
931  double freqmin = 0.)
932  throw (DimensionException);
933 
950  static double originRegressionDprime(
952  bool distance1 = false,
953  bool keepsingleton = true,
954  double freqmin = 0.)
955  throw (DimensionException);
956 
973  static double originRegressionR2(
975  bool distance1 = false,
976  bool keepsingleton = true,
977  double freqmin = 0.)
978  throw (DimensionException);
979 
996  static Vdouble linearRegressionD(
998  bool distance1 = false,
999  bool keepsingleton = true,
1000  double freqmin = 0.)
1001  throw (DimensionException);
1002 
/**
 * @brief Give the slope and the origin of the regression |D'| = a*distance+b.
 *
 * @param psc           The alignment.
 * @param distance1     Defaults to false (presumably selects pairwise
 *                      distance method 1 instead of method 2 -- TODO confirm).
 * @param keepsingleton Defaults to true (presumably whether singleton sites
 *                      are kept -- TODO confirm).
 * @param freqmin       Defaults to 0. (presumably a minimum allele frequency
 *                      threshold -- TODO confirm).
 * @return A Vdouble holding the slope and the origin of the regression.
 * @throw DimensionException Declared in the exception specification.
 */
1019  static Vdouble linearRegressionDprime(
1020  const PolymorphismSequenceContainer& psc,
1021  bool distance1 = false,
1022  bool keepsingleton = true,
1023  double freqmin = 0.) throw (DimensionException);
1024 
/**
 * @brief Give the slope and the origin of the regression R² = a*distance+b.
 *
 * @param psc           The alignment.
 * @param distance1     Defaults to false (presumably selects pairwise
 *                      distance method 1 instead of method 2 -- TODO confirm).
 * @param keepsingleton Defaults to true (presumably whether singleton sites
 *                      are kept -- TODO confirm).
 * @param freqmin       Defaults to 0. (presumably a minimum allele frequency
 *                      threshold -- TODO confirm).
 * @return A Vdouble holding the slope and the origin of the regression.
 * @throw DimensionException Declared in the exception specification.
 */
1041  static Vdouble linearRegressionR2(
1042  const PolymorphismSequenceContainer& psc,
1043  bool distance1 = false,
1044  bool keepsingleton = true,
1045  double freqmin = 0.)
1046  throw (DimensionException);
1047 
/**
 * @brief Give the slope of the regression R² = 1/(1+a*distance).
 *
 * @param psc           The alignment.
 * @param distance1     Defaults to false (presumably selects pairwise
 *                      distance method 1 instead of method 2 -- TODO confirm).
 * @param keepsingleton Defaults to true (presumably whether singleton sites
 *                      are kept -- TODO confirm).
 * @param freqmin       Defaults to 0. (presumably a minimum allele frequency
 *                      threshold -- TODO confirm).
 * @return The slope a of the regression, as a double.
 * @throw DimensionException Declared in the exception specification.
 */
1065  static double inverseRegressionR2(
1066  const PolymorphismSequenceContainer& psc,
1067  bool distance1 = false,
1068  bool keepsingleton = true,
1069  double freqmin = 0.)
1070  throw (DimensionException);
1071 
/**
 * @brief Give an estimate of C=4Nr using Hudson's method
 *        (Hudson 1987, Genet. Res., 50 pp245-250).
 *
 * @param psc       The alignment.
 * @param precision Defaults to 0.000001 (presumably the numerical precision
 *                  of the estimate -- TODO confirm).
 * @param cinf      Defaults to 0.001 (presumably the lower bound searched
 *                  for C -- TODO confirm).
 * @param csup      Defaults to 10000. (presumably the upper bound searched
 *                  for C -- TODO confirm).
 * @return The estimate of C, as a double.
 */
1081  static double hudson87(
1082  const PolymorphismSequenceContainer& psc,
1083  double precision = 0.000001,
1084  double cinf = 0.001,
1085  double csup = 10000.);
1086 
/**
 * @brief Test useful values.
 *
 * @param s An output stream (presumably where the values are written --
 *          TODO confirm against the implementation).
 * @param n A size_t (presumably the sample size passed on to
 *          getUsefulValues_ -- TODO confirm).
 */
1093  static void testUsefulValues(
1094  std::ostream& s,
1095  size_t n);
1096 
1097 private:
/**
 * @brief Count the number of mutations for a site.
 *
 * @param site The site to examine.
 * @return The number of mutations, as an unsigned int.
 */
1101  static unsigned int getNumberOfMutations_(const Site& site);
1102 
/**
 * @brief Count the number of singletons for a site.
 *
 * @param site The site to examine.
 * @return The number of singletons, as an unsigned int.
 */
1106  static unsigned int getNumberOfSingletons_(const Site& site);
1107 
/**
 * @brief Count the number of singletons for a site.
 *
 * NOTE(review): the generated documentation uses the same one-line summary
 * as getNumberOfSingletons_; given the name and the two site arguments this
 * presumably counts only singletons that are derived relative to the
 * outgroup site -- TODO confirm against the implementation.
 *
 * @param site_in  A site (presumably from the ingroup alignment).
 * @param site_out A site (presumably the matching outgroup site).
 * @return The count, as an unsigned.
 */
1115  static unsigned getNumberOfDerivedSingletons_(
1116  const Site& site_in,
1117  const Site& site_out);
1118 
/**
 * @brief Get useful values for theta estimators.
 *
 * @param n A size_t (presumably the number of sequences -- TODO confirm).
 * @return A map from value name (std::string) to value (double); the set of
 *         keys is not visible in this header.
 */
1152  static std::map<std::string, double> getUsefulValues_(
1153  size_t n);
1154 
/**
 * @brief Get the vD value of equation (32) in Fu & Li (1993, Genetics, 133
 *        pp693-709).
 *
 * @param n  A size_t (presumably the number of sequences -- TODO confirm).
 * @param a1 A double (presumably the a1 term of the paper -- TODO confirm).
 * @param a2 A double (presumably the a2 term of the paper -- TODO confirm).
 * @param cn A double (presumably the cn term of the paper -- TODO confirm).
 * @return The vD value, as a double.
 */
1167  static double getVD_(
1168  size_t n,
1169  double a1,
1170  double a2,
1171  double cn);
1172 
/**
 * @brief Get the uD value of equation (32) in Fu & Li (1993, Genetics, 133
 *        pp693-709).
 *
 * @param a1 A double (presumably the a1 term of the paper -- TODO confirm).
 * @param vD A double (presumably the value returned by getVD_ -- TODO confirm).
 * @return The uD value, as a double.
 */
1183  static double getUD_(
1184  double a1,
1185  double vD);
1186 
/**
 * @brief Get the vD* value of the D* equation in Fu & Li (1993, Genetics, 133
 *        pp693-709).
 *
 * @param n  A size_t (presumably the number of sequences -- TODO confirm).
 * @param a1 A double (presumably the a1 term of the paper -- TODO confirm).
 * @param a2 A double (presumably the a2 term of the paper -- TODO confirm).
 * @param dn A double (presumably the dn term of the paper -- TODO confirm).
 * @return The vD* value, as a double.
 */
1199  static double getVDstar_(
1200  size_t n,
1201  double a1,
1202  double a2,
1203  double dn);
1204 
/**
 * @brief Get the uD* value of the D* equation in Fu & Li (1993, Genetics, 133
 *        pp693-709).
 *
 * @param n   A size_t (presumably the number of sequences -- TODO confirm).
 * @param a1  A double (presumably the a1 term of the paper -- TODO confirm).
 * @param vDs A double (presumably the value returned by getVDstar_ --
 *            TODO confirm).
 * @return The uD* value, as a double.
 */
1216  static double getUDstar_(
1217  size_t n,
1218  double a1,
1219  double vDs);
1220 
/**
 * @brief Give the left hand term of equation (4) in Hudson (1987, Genet. Res.,
 *        50 pp245-250).
 *
 * @param psc The alignment.
 * @return The left hand term, as a double.
 */
1226  static double leftHandHudson_(
1227  const PolymorphismSequenceContainer& psc);
1228 
/**
 * @brief Give the right hand term of equation (4) in Hudson (1987, Genet.
 *        Res., 50 pp245-250).
 *
 * @param c A double (presumably the candidate value of C=4Nr -- TODO confirm).
 * @param n A size_t (presumably the number of sequences -- TODO confirm).
 * @return The right hand term, as a double.
 */
1233  static double rightHandHudson_(
1234  double c,
1235  size_t n);
1236 
1237  /************************************************************************/
1238 };
1239 } // end of namespace bpp;
1240 
1241 #endif // _SEQUENCESTATISTICS_H_
1242 
static unsigned int getNumberOfSingletons_(const Site &site)
Count the number of singleton for a site.
static unsigned int totalNumberOfMutationsOnExternalBranches(const PolymorphismSequenceContainer &ing, const PolymorphismSequenceContainer &outg)
Count the total number of mutations in external branches.
static double tajima83(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute diversity estimator Theta of Tajima (1983, Genetics, 105 pp437-460)
static unsigned int dvk(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of haplotypes in the sample. Depaulis and Veuille (1998, Mol Biol Evol...
static double getUD_(double a1, double vD)
Get the uD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709).
static double meanNumberOfSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
Compute the mean number of synonymous sites in an alignment.
static double watterson75(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute diversity estimator Theta of Watterson (1975, Theor Popul Biol, 7 pp256-276) ...
static double neutralityIndex(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return the neutrality index NI = (Pa/Ps)/(Da/Ds) (Rand & Kann 1996, Mol. Biol. Evol. 13 pp735-748)
static double originRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
Give the slope of the regression |D'| = 1+a*distance.
static std::vector< unsigned int > fixedDifferences(const PolymorphismSequenceContainer &pscin, const PolymorphismSequenceContainer &pscout, PolymorphismSequenceContainer &psccons, const GeneticCode &gc)
Compute the number of fixed differences between two alignments.
static double meanD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D over all pairwise comparisons
static double fuLiDStar(const PolymorphismSequenceContainer &group)
Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double fuLiD(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool original=true)
Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
static double watterson75Synonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975,Theor Popul Biol, 7 pp256-276) estimator for synonymous positions...
static double leftHandHudson_(const PolymorphismSequenceContainer &psc)
give the left hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term ...
static Vdouble pairwiseD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D value between two sites (Lewontin & Kojima 1964...
static double meanDistance2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 2: differences between sequences are taken into a...
static unsigned int numberOfSitesWithStopCodon(const PolymorphismSequenceContainer &psc, const GeneticCode &gCode, bool gapflag=true)
Compute the number of codon sites with stop codon.
static void testUsefulValues(std::ostream &s, size_t n)
Test useful values.
static unsigned int totalNumberOfMutations(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the total number of mutations in an alignment.
static double fuLiF(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool original=true)
Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
static Vdouble linearRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D| = a*distance+b
static double piSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the synonymous nucleotide diversity, pi.
static double gcContent(const PolymorphismSequenceContainer &psc)
Compute the mean GC content in an alignment.
static double meanDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Give the mean D' over all pairwise comparisons.
static std::map< std::string, double > getUsefulValues_(size_t n)
Get useful values for theta estimators.
static double tajimaDss(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double ratioOfTransitionsTransversions(const PolymorphismSequenceContainer &psc)
Return the ratio of transitions/transversions.
double fstHudson92(const PolymorphismSequenceContainer &psc, size_t id1, size_t id2)
static unsigned int numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
Compute the number of synonymous substitutions in an alignment.
static Vdouble linearRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
Give the slope and the origin of the regression |D'| = a*distance+b.
static double originRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1+a*distance
static Vdouble pairwiseDistances1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of the pairwise distances between site positions corresponding to a LD SequencePolymo...
static double originRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D| = 1+a*distance
static double getVDstar_(size_t n, double a1, double a2, double dn)
Get the vD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709).
static double heterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site heterozygosity in an alignment.
static std::vector< unsigned int > gcPolymorphism(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of GC alleles and the total number of alleles at polymorphic sites only...
Static class providing methods to compute statistics on sequences data.
static double hudson87(const PolymorphismSequenceContainer &psc, double precision=0.000001, double cinf=0.001, double csup=10000.)
give estimate of C=4Nr using Hudson method (Hudson 1987, Genet. Res., 50 pp245-250) ...
static double piNonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the non-synonymous nucleotide diversity, pi.
static unsigned int numberOfTransversions(const PolymorphismSequenceContainer &psc)
Return the number of transversions.
static double tajimaDtnm(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double fayWu2000(const PolymorphismSequenceContainer &psc, const Sequence &ancestralSites)
Compute diversity estimator Theta H (eq. 3) of Fay and Wu (2000, Genetics, 155: 1405-1413) ...
static double meanDistance1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 1: differences between sequences are not taken in...
static double meanR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean R² over all pairwise comparisons
static PolymorphismSequenceContainer * generateLdContainer(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Generate a special PolymorphismSequenceContainer for linkage disequilibrium analysis ...
static unsigned int numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer &psc, bool stopflag=true, bool gapflag=true)
Compute the number of polymorphic codon with only one mutated site.
static Vdouble pairwiseR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise R² value between two sites (Hill & Robertson 1968...
static unsigned int numberOfTransitions(const PolymorphismSequenceContainer &psc)
Return the number of transitions.
static double fuLiFStar(const PolymorphismSequenceContainer &group)
Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double dvh(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the haplotype diversity of a sample. Depaulis and Veuille (1998, Mol Biol Evol, 12 pp1788-1790)
static unsigned int numberOfTriplets(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of triplet in an alignment.
static unsigned int numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
Compute the number of non-synonymous substitutions in an alignment.
static Vdouble pairwiseDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Give the vector of all mean pairwise D' values between two sites (Lewontin 1964, Genetics 49 pp49-67)...
static unsigned getNumberOfDerivedSingletons_(const Site &site_in, const Site &site_out)
Count the number of singleton for a site.
static std::vector< unsigned int > mkTable(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return a vector containing Pa, Ps, Da, Ds
The PolymorphismSequenceContainer class.
static Vdouble pairwiseDistances2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise distance between two sites to a LD SequencePolymorphismContainer...
static unsigned int numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of parsimony informative sites in an alignment.
static double watterson75NonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975, Theor Popul Biol, 7 pp256-276) estimator for non synonymous positions...
static Vdouble linearRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression R² = a*distance+b
static double squaredHeterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site squared heterozygosity in an alignment.
static double rightHandHudson_(double c, size_t n)
give the right hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term...
static double getVD_(size_t n, double a1, double a2, double cn)
Get the vD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709).
static double getUDstar_(size_t n, double a1, double vDs)
Get the uD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709).
static double meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
compute the mean number of non-synonymous site in an alignment
static unsigned int numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the number of synonymous polymorphic codon sites.
static unsigned int getNumberOfMutations_(const Site &site)
Count the number of mutation for a site.
static double inverseRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1/(1+a*distance)
static unsigned int numberOfSingletons(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the number of singleton nucleotides in an alignment.
static unsigned int numberOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the number of polymorphic site in an alignment.