44 #ifndef _SEQUENCESTATISTICS_H_ 45 #define _SEQUENCESTATISTICS_H_ 48 #include <Bpp/Seq/SymbolListTools.h> 49 #include <Bpp/Seq/Alphabet/CodonAlphabet.h> 50 #include <Bpp/Seq/GeneticCode/GeneticCode.h> 51 #include <Bpp/Seq/Container/SiteContainerIterator.h> 52 #include <Bpp/Seq/Container/SiteContainer.h> 53 #include <Bpp/Seq/Container/SiteContainerTools.h> 90 bool ignoreUnknown =
true);
101 bool gapflag =
true);
113 bool gapflag =
true);
127 bool gapflag =
true);
156 bool gapflag =
true);
166 bool gapflag =
true);
177 bool gapflag =
true);
203 bool gapflag =
true);
224 bool ignoreUnknown =
true);
245 bool gapflag =
true);
257 const Sequence& ancestralSites);
271 static unsigned int dvk(
273 bool gapflag =
true);
289 bool gapflag =
true);
330 const GeneticCode& gCode,
331 bool gapflag =
true);
347 bool stopflag =
true,
348 bool gapflag =
true);
362 const GeneticCode& gc);
380 const GeneticCode& gc);
398 const GeneticCode& gc);
417 const GeneticCode& gc,
418 bool minchange =
false);
437 const GeneticCode& gc,
438 bool minchange =
false);
456 const GeneticCode& gc,
474 const GeneticCode& gc,
494 const GeneticCode& gc,
495 double freqmin = 0.);
514 const GeneticCode& gc,
515 double freqmin = 0.);
538 const GeneticCode& gc);
551 static std::vector<unsigned int>
mkTable(
554 const GeneticCode& gc,
555 double freqmin = 0.);
573 const GeneticCode& gc,
574 double freqmin = 0.);
594 throw (ZeroDivisionException);
612 throw (ZeroDivisionException);
632 bool original =
true)
633 throw (ZeroDivisionException);
643 throw (ZeroDivisionException);
662 bool original =
true)
663 throw (ZeroDivisionException);
673 throw (ZeroDivisionException);
727 bool keepsingleton =
true,
728 double freqmin = 0.);
745 bool keepsingleton =
true,
747 throw (DimensionException);
765 bool keepsingleton =
true,
767 throw (DimensionException);
783 bool keepsingleton =
true,
785 throw (DimensionException);
801 bool keepsingleton =
true,
803 throw (DimensionException);
819 bool keepsingleton =
true,
821 throw (DimensionException);
837 bool keepsingleton =
true,
839 throw (DimensionException);
855 bool keepsingleton =
true,
857 throw (DimensionException);
873 bool keepsingleton =
true,
875 throw (DimensionException);
890 bool keepsingleton =
true,
892 throw (DimensionException);
907 bool keepsingleton =
true,
909 throw (DimensionException);
929 bool distance1 =
false,
930 bool keepsingleton =
true,
932 throw (DimensionException);
952 bool distance1 =
false,
953 bool keepsingleton =
true,
955 throw (DimensionException);
975 bool distance1 =
false,
976 bool keepsingleton =
true,
978 throw (DimensionException);
998 bool distance1 =
false,
999 bool keepsingleton =
true,
1000 double freqmin = 0.)
1001 throw (DimensionException);
1021 bool distance1 =
false,
1022 bool keepsingleton =
true,
1023 double freqmin = 0.)
throw (DimensionException);
1043 bool distance1 =
false,
1044 bool keepsingleton =
true,
1045 double freqmin = 0.)
1046 throw (DimensionException);
1067 bool distance1 =
false,
1068 bool keepsingleton =
true,
1069 double freqmin = 0.)
1070 throw (DimensionException);
1083 double precision = 0.000001,
1084 double cinf = 0.001,
1085 double csup = 10000.);
1116 const Site& site_in,
1117 const Site& site_out);
1241 #endif // _SEQUENCESTATISTICS_H_ static unsigned int getNumberOfSingletons_(const Site &site)
Count the number of singletons for a site.
static unsigned int totalNumberOfMutationsOnExternalBranches(const PolymorphismSequenceContainer &ing, const PolymorphismSequenceContainer &outg)
Count the total number of mutations in external branches.
static double tajima83(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute diversity estimator Theta of Tajima (1983, Genetics, 105 pp437-460)
static unsigned int dvk(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of haplotypes in the sample. Depaulis and Veuille (1998, Mol Biol Evol...
static double getUD_(double a1, double vD)
Get the uD value of equation (32) in (Fu & Li 1993, Genetics, 133 pp693-709)
static double meanNumberOfSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
Compute the mean number of synonymous sites in an alignment
static double watterson75(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute diversity estimator Theta of Watterson (1975, Theor Popul Biol, 7 pp256-276) ...
static double neutralityIndex(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return the neutrality index NI = (Pa/Ps)/(Da/Ds) (Rand & Kann 1996, Mol. Biol. Evol. 13 pp735-748)
static double originRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D'| = 1+a*distance
static std::vector< unsigned int > fixedDifferences(const PolymorphismSequenceContainer &pscin, const PolymorphismSequenceContainer &pscout, PolymorphismSequenceContainer &psccons, const GeneticCode &gc)
Compute the number of fixed differences between two alignments
static double meanD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D over all pairwise comparisons
static double fuLiDStar(const PolymorphismSequenceContainer &group)
Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double fuLiD(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool original=true)
Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
static double watterson75Synonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975,Theor Popul Biol, 7 pp256-276) estimator for synonymous positions...
static double leftHandHudson_(const PolymorphismSequenceContainer &psc)
give the left hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term ...
static Vdouble pairwiseD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D value between two sites (Lewontin & Kojima 1964...
static double meanDistance2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 2: differences between sequences are taken into a...
static unsigned int numberOfSitesWithStopCodon(const PolymorphismSequenceContainer &psc, const GeneticCode &gCode, bool gapflag=true)
Compute the number of codon sites with stop codon.
static void testUsefulValues(std::ostream &s, size_t n)
Test useful values.
static unsigned int totalNumberOfMutations(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the total number of mutations in an alignment.
static double fuLiF(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool original=true)
Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
static Vdouble linearRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D| = a*distance+b
static double piSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the synonymous nucleotide diversity, pi.
static double gcContent(const PolymorphismSequenceContainer &psc)
Compute the mean GC content in an alignment.
static double meanDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D' over all pairwise comparisons
static std::map< std::string, double > getUsefulValues_(size_t n)
Get useful values for theta estimators.
static double tajimaDss(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double ratioOfTransitionsTransversions(const PolymorphismSequenceContainer &psc)
Return the ratio of transitions/transversions.
double fstHudson92(const PolymorphismSequenceContainer &psc, size_t id1, size_t id2)
static unsigned int numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
Compute the number of synonymous substitutions in an alignment
static Vdouble linearRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D'| = a*distance+b
static double originRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1+a*distance
static Vdouble pairwiseDistances1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of the pairwise distances between site positions corresponding to a LD SequencePolymo...
static double originRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D| = 1+a*distance
static double getVDstar_(size_t n, double a1, double a2, double dn)
Get the vD* value of D* equation in (Fu & Li 1993, Genetics, 133 pp693-709)
static double heterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site heterozygosity in an alignment.
static std::vector< unsigned int > gcPolymorphism(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of GC alleles and the total number of alleles at polymorphic sites only...
Static class providing methods to compute statistics on sequences data.
static double hudson87(const PolymorphismSequenceContainer &psc, double precision=0.000001, double cinf=0.001, double csup=10000.)
give estimate of C=4Nr using Hudson method (Hudson 1987, Genet. Res., 50 pp245-250) ...
static double piNonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the non-synonymous nucleotide diversity, pi.
static unsigned int numberOfTransversions(const PolymorphismSequenceContainer &psc)
Return the number of transversions.
static double tajimaDtnm(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double fayWu2000(const PolymorphismSequenceContainer &psc, const Sequence &ancestralSites)
Compute diversity estimator Theta H (eq. 3) of Fay and Wu (2000, Genetics, 155: 1405-1413) ...
static double meanDistance1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 1: differences between sequences are not taken in...
static double meanR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean R² over all pairwise comparisons
static PolymorphismSequenceContainer * generateLdContainer(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Generate a special PolymorphismSequenceContainer for linkage disequilibrium analysis ...
static unsigned int numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer &psc, bool stopflag=true, bool gapflag=true)
Compute the number of polymorphic codons with only one mutated site.
static Vdouble pairwiseR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise R² value between two sites (Hill & Robertson 1968...
static unsigned int numberOfTransitions(const PolymorphismSequenceContainer &psc)
Return the number of transitions.
static double fuLiFStar(const PolymorphismSequenceContainer &group)
Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double dvh(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the haplotype diversity of a sample. Depaulis and Veuille (1998, Mol Biol Evol, 12 pp1788-1790)
static unsigned int numberOfTriplets(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of triplets in an alignment.
static unsigned int numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
Compute the number of non-synonymous substitutions in an alignment
static Vdouble pairwiseDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Give the vector of all mean pairwise D' values between two sites (Lewontin 1964, Genetics 49 pp49-67)...
static unsigned getNumberOfDerivedSingletons_(const Site &site_in, const Site &site_out)
Count the number of singletons for a site.
static std::vector< unsigned int > mkTable(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return a vector containing Pa, Ps, Da, Ds
The PolymorphismSequenceContainer class.
static Vdouble pairwiseDistances2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise distance between two sites to a LD SequencePolymorphismContainer...
static unsigned int numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of parsimony informative sites in an alignment.
static double watterson75NonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975, Theor Popul Biol, 7 pp256-276) estimator for non synonymous positions...
static Vdouble linearRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression R² = a*distance+b
static double squaredHeterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site squared heterozygosity in an alignment.
static double rightHandHudson_(double c, size_t n)
give the right hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term...
static double getVD_(size_t n, double a1, double a2, double cn)
Get the vD value of equation (32) in (Fu & Li 1993, Genetics, 133 pp693-709)
static double getUDstar_(size_t n, double a1, double vDs)
Get the uD* value of D* equation in (Fu & Li 1993, Genetics, 133 pp693-709)
static double meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
Compute the mean number of non-synonymous sites in an alignment
static unsigned int numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the number of synonymous polymorphic codon sites.
static unsigned int getNumberOfMutations_(const Site &site)
Count the number of mutations for a site.
static double inverseRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1/(1+a*distance)
static unsigned int numberOfSingletons(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the number of singleton nucleotides in an alignment.
static unsigned int numberOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the number of polymorphic sites in an alignment.