56 #include <Bpp/Seq/Site.h> 57 #include <Bpp/Seq/SiteTools.h> 58 #include <Bpp/Seq/StringSequenceTools.h> 59 #include <Bpp/Seq/CodonSiteTools.h> 60 #include <Bpp/Seq/Alphabet/DNA.h> 61 #include <Bpp/Seq/Alphabet/AlphabetTools.h> 63 #include <Bpp/Numeric/VectorTools.h> 64 #include <Bpp/Numeric/VectorExceptions.h> 76 auto_ptr<ConstSiteIterator> si;
78 si.reset(
new CompleteSiteContainerIterator(psc));
80 si.reset(
new SimpleSiteContainerIterator(psc));
81 while (si->hasMoreSites())
83 site = si->nextSite();
84 if (!SiteTools::isConstant(*site, ignoreUnknown))
94 auto_ptr<ConstSiteIterator> si;
96 si.reset(
new CompleteSiteContainerIterator(psc));
98 si.reset(
new SimpleSiteContainerIterator(psc));
100 const Site* site = 0;
101 while (si->hasMoreSites())
103 site = si->nextSite();
104 if (SiteTools::isParsimonyInformativeSite(*site))
114 auto_ptr<ConstSiteIterator> si;
116 si.reset(
new CompleteSiteContainerIterator(psc));
118 si.reset(
new SimpleSiteContainerIterator(psc));
119 unsigned int nus = 0;
120 const Site* site = 0;
121 while (si->hasMoreSites())
123 site = si->nextSite();
124 nus += getNumberOfSingletons_(*site);
131 auto_ptr<ConstSiteIterator> si;
133 si.reset(
new CompleteSiteContainerIterator(psc));
135 si.reset(
new SimpleSiteContainerIterator(psc));
137 const Site* site = 0;
138 while (si->hasMoreSites())
140 site = si->nextSite();
141 if (SiteTools::isTriplet(*site))
151 auto_ptr<ConstSiteIterator> si;
153 si.reset(
new CompleteSiteContainerIterator(psc));
155 si.reset(
new SimpleSiteContainerIterator(psc));
156 unsigned int tnm = 0;
157 const Site* site = 0;
158 while (si->hasMoreSites())
160 site = si->nextSite();
161 tnm += getNumberOfMutations_(*site);
166 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
170 if (ing.getNumberOfSites() != outg.getNumberOfSites())
171 throw Exception(
"ing and outg must have the same size");
172 unsigned int nmuts = 0;
173 const Site* site_in = 0;
174 const Site* site_out = 0;
175 auto_ptr<ConstSiteIterator> si(
new SimpleSiteContainerIterator(ing));
176 auto_ptr<ConstSiteIterator> so(
new SimpleSiteContainerIterator(outg));
177 while (si->hasMoreSites())
179 site_in = si->nextSite();
180 site_out = so->nextSite();
182 if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
183 nmuts += getNumberOfDerivedSingletons_(*site_in, *site_out);
190 auto_ptr<ConstSiteIterator> si;
192 si.reset(
new CompleteSiteContainerIterator(psc));
194 si.reset(
new SimpleSiteContainerIterator(psc));
195 const Site* site = 0;
197 while (si->hasMoreSites())
199 site = si->nextSite();
200 s += SiteTools::heterozygosity(*site);
207 auto_ptr<ConstSiteIterator> si;
209 si.reset(
new CompleteSiteContainerIterator(psc));
211 si.reset(
new SimpleSiteContainerIterator(psc));
212 const Site* site = 0;
214 while (si->hasMoreSites())
216 site = si->nextSite();
217 double h = SiteTools::heterozygosity(*site);
229 map<int, double> freqs;
230 SequenceContainerTools::getFrequencies(psc, freqs);
231 const Alphabet* alpha = psc.getAlphabet();
232 return (freqs[alpha->charToInt(
"C")] + freqs[alpha->charToInt(
"G")]) / (freqs[alpha->charToInt(
"A")] + freqs[alpha->charToInt(
"C")] + freqs[alpha->charToInt(
"G")] + freqs[alpha->charToInt(
"T")]);
237 unsigned int nbMut = 0;
238 unsigned int nbGC = 0;
239 size_t nbSeq = psc.getNumberOfSequences();
240 vector<unsigned int> vect(2);
241 const Site* site = 0;
242 auto_ptr<ConstSiteIterator> si;
244 si.reset(
new CompleteSiteContainerIterator(psc));
246 si.reset(
new NoGapSiteContainerIterator(psc));
247 while (si->hasMoreSites())
249 site = si->nextSite();
250 if (!SiteTools::isConstant(*site))
252 long double freqGC = SymbolListTools::getGCContent(*site);
259 if (freqGC > 0 && freqGC < 1)
261 nbMut +=
static_cast<unsigned int>(nbSeq);
262 long double adGC = freqGC * nbSeq;
263 nbGC +=
static_cast<unsigned int>(adGC);
279 size_t n = psc.getNumberOfSequences();
280 unsigned int s = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
281 map<string, double> values = getUsefulValues_(n);
282 ThetaW =
static_cast<double>(s) / values[
"a1"];
288 size_t alphabet_size = psc.getAlphabet()->getSize();
289 const Site* site = 0;
290 ConstSiteIterator* si = 0;
293 si =
new CompleteSiteContainerIterator(psc);
295 si =
new SimpleSiteContainerIterator(psc);
296 while (si->hasMoreSites())
298 site = si->nextSite();
299 if (!SiteTools::isConstant(*site))
302 map<int, size_t> count;
303 SymbolListTools::getCounts(*site, count);
304 map<int, size_t> tmp_k;
306 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
308 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
310 tmp_k[it->first] = it->second * (it->second - 1);
314 if (tmp_n == 0 || tmp_n == 1)
316 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
318 value +=
static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
320 value2 += 1. - value;
329 if (psc.getNumberOfSites() != ancestralSites.size())
330 throw Exception(
"SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
332 const Sequence& tmps = psc.getSequence(0);
334 size_t alphabet_size = (psc.getAlphabet())->getSize();
336 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
338 const Site& site = psc.getSite(i);
339 string ancB = ancestralSites.getChar(i);
340 int ancV = ancestralSites.getValue(i);
342 if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
347 map<int, size_t> count;
348 SymbolListTools::getCounts(site, count);
349 map<int, size_t> tmp_k;
351 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
353 if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
356 if (it->first != ancV)
358 tmp_k[it->first] = 2 * it->second * it->second;
363 if (tmp_n == 0 || tmp_n == 1)
365 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
367 value +=
static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
381 auto_ptr<PolymorphismSequenceContainer> sc;
383 sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
387 vector<string> pscvector;
388 pscvector.push_back(sc->toString(0));
390 for (
size_t i = 1; i < sc->getNumberOfSequences(); i++)
393 string query = sc->toString(i);
394 for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
396 if (query.compare(*it) == 0)
405 pscvector.push_back(query);
409 return static_cast<unsigned int>(pscvector.size());
419 auto_ptr<PolymorphismSequenceContainer> sc;
421 sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
426 vector<string> pscvector;
427 vector<size_t> effvector;
428 pscvector.push_back(sc->toString(0));
431 for (
size_t i = 1; i < sc->getNumberOfSequences(); i++)
435 string query = sc->toString(i);
436 for (
size_t j = 0; j < pscvector.size(); j++)
438 if (query.compare(pscvector[j]) == 0)
447 pscvector.push_back(query);
451 for (
size_t i = 0; i < effvector.size(); i++)
453 H -= (
static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq));
461 unsigned int nbT = 0;
462 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
463 const Site* site = 0;
464 while (si->hasMoreSites())
466 site = si->nextSite();
468 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
470 vector<int> seq = site->getContent();
473 for (
size_t i = 1; i < seq.size(); i++)
475 if (state1 != seq[i])
481 if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
482 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
492 unsigned int nbTv = 0;
493 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
494 const Site* site = 0;
495 while (si->hasMoreSites())
497 site = si->nextSite();
499 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
501 vector<int> seq = site->getContent();
504 for (
size_t i = 1; i < seq.size(); i++)
506 if (state1 != seq[i])
512 if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
513 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
526 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
527 const Site* site = 0;
528 vector<int> state(2);
529 while (si->hasMoreSites())
531 map<int, size_t> count;
532 site = si->nextSite();
533 SymbolListTools::getCounts(*site, count);
534 if (count.size() != 2)
537 for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
539 state[i] = it->first;
542 if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
543 ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
553 throw ZeroDivisionException(
"SequenceStatistics::getTransitionsTransversionsRatio.");
567 if (!AlphabetTools::isCodonAlphabet(psc.getAlphabet()))
568 throw AlphabetMismatchException(
"SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", psc.getAlphabet());
570 auto_ptr<ConstSiteIterator> si;
572 si.reset(
new NoGapSiteContainerIterator(psc));
574 si.reset(
new SimpleSiteContainerIterator(psc));
576 const Site* site = 0;
577 while (si->hasMoreSites())
579 site = si->nextSite();
580 if (CodonSiteTools::hasStop(*site, gCode))
588 auto_ptr<ConstSiteIterator> si;
590 si.reset(
new CompleteSiteContainerIterator(psc));
594 si.reset(
new NoGapSiteContainerIterator(psc));
596 si.reset(
new SimpleSiteContainerIterator(psc));
600 while (si->hasMoreSites())
602 site = si->nextSite();
603 if (CodonSiteTools::isMonoSitePolymorphic(*site))
611 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
614 while (si->hasMoreSites())
616 site = si->nextSite();
617 if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
626 size_t n = psc.getNumberOfSequences();
627 unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
628 map<string, double> values = getUsefulValues_(n);
629 ThetaW =
static_cast<double>(s) / values[
"a1"];
636 size_t n = psc.getNumberOfSequences();
637 unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
638 map<string, double> values = getUsefulValues_(n);
639 ThetaW =
static_cast<double>(s) / values[
"a1"];
646 ConstSiteIterator* si =
new CompleteSiteContainerIterator(psc);
647 const Site* site = 0;
648 while (si->hasMoreSites())
650 site = si->nextSite();
651 S += CodonSiteTools::piSynonymous(*site, gc, minchange);
660 ConstSiteIterator* si =
new CompleteSiteContainerIterator(psc);
661 const Site* site = 0;
662 while (si->hasMoreSites())
664 site = si->nextSite();
665 S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
674 ConstSiteIterator* si =
new CompleteSiteContainerIterator(psc);
675 const Site* site = 0;
676 while (si->hasMoreSites())
678 site = si->nextSite();
679 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
689 ConstSiteIterator* si =
new CompleteSiteContainerIterator(psc);
690 const Site* site = 0;
691 while (si->hasMoreSites())
693 site = si->nextSite();
695 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
698 return static_cast<double>(n - S);
703 size_t st = 0, sns = 0;
704 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
705 const Site* site = 0;
706 while (si->hasMoreSites())
708 site = si->nextSite();
709 st += CodonSiteTools::numberOfSubsitutions(*site, gc, freqmin);
710 sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
712 return static_cast<unsigned int>(st - sns);
717 unsigned int sns = 0;
718 auto_ptr<ConstSiteIterator> si(
new CompleteSiteContainerIterator(psc));
719 const Site* site = 0;
720 while (si->hasMoreSites())
722 site = si->nextSite();
723 sns +=
static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin));
730 auto_ptr<ConstSiteIterator> siIn(
new CompleteSiteContainerIterator(pscin));
731 auto_ptr<ConstSiteIterator> siOut(
new CompleteSiteContainerIterator(pscout));
732 auto_ptr<ConstSiteIterator> siCons(
new CompleteSiteContainerIterator(psccons));
733 const Site* siteIn = 0;
734 const Site* siteOut = 0;
735 const Site* siteCons = 0;
738 while (siIn->hasMoreSites())
740 siteIn = siIn->nextSite();
741 siteOut = siOut->nextSite();
742 siteCons = siCons->nextSite();
743 vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
747 vector<unsigned int> v(2);
748 v[0] =
static_cast<unsigned int>(NfixS);
749 v[1] =
static_cast<unsigned int>(NfixA);
756 for (
size_t i = 0; i < outgroup.getNumberOfSequences(); i++)
761 auto_ptr<const PolymorphismSequenceContainer> psccomplet(PolymorphismSequenceContainerTools::getCompleteSites(psctot));
762 auto_ptr<const PolymorphismSequenceContainer> pscin (PolymorphismSequenceContainerTools::extractIngroup(*psccomplet));
763 auto_ptr<const PolymorphismSequenceContainer> pscout (PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet));
764 auto_ptr<const Sequence> consensusIn (SiteContainerTools::getConsensus(*pscin,
"consensusIn"));
765 auto_ptr<const Sequence> consensusOut(SiteContainerTools::getConsensus(*pscout,
"consensusOut"));
769 vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
770 vector<unsigned int> v(4);
771 v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscin, gc, freqmin);
772 v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscin, gc, freqmin);
780 vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
781 if (v[1] != 0 && v[2] != 0)
782 return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
793 unsigned int Sp = numberOfPolymorphicSites(psc, gapflag);
795 throw ZeroDivisionException(
"SequenceStatistics::tajimaDss. S should not be 0.");
796 double S =
static_cast<double>(Sp);
797 double tajima = tajima83(psc, gapflag);
798 double watterson = watterson75(psc, gapflag);
799 size_t n = psc.getNumberOfSequences();
800 map<string, double> values = getUsefulValues_(n);
801 return (tajima - watterson) / sqrt((values[
"e1"] * S) + (values[
"e2"] * S * (S - 1)));
806 unsigned int etaP = totalNumberOfMutations(psc, gapflag);
808 throw ZeroDivisionException(
"SequenceStatistics::tajimaDtnm. Eta should not be 0.");
809 double eta =
static_cast<double>(etaP);
810 double tajima = tajima83(psc, gapflag);
811 size_t n = psc.getNumberOfSequences();
812 map<string, double> values = getUsefulValues_(n);
813 double eta_a1 = eta / values[
"a1"];
814 return (tajima - eta_a1) / sqrt((values[
"e1"] * eta) + (values[
"e2"] * eta * (eta - 1)));
819 size_t n = ingroup.getNumberOfSequences();
820 map<string, double> values = getUsefulValues_(n);
821 double vD = getVD_(n, values[
"a1"], values[
"a2"], values[
"cn"]);
822 double uD = getUD_(values[
"a1"], vD);
823 unsigned int etaP = totalNumberOfMutations(ingroup);
825 throw ZeroDivisionException(
"SequenceStatistics::fuLiD. Eta should not be 0.");
826 double eta =
static_cast<double>(etaP);
829 etae =
static_cast<double>(numberOfSingletons(outgroup));
831 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
832 return (eta - (values[
"a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
837 size_t n = group.getNumberOfSequences();
838 double nn =
static_cast<double>(n);
839 double _n = nn / (nn - 1.);
840 map<string, double> values = getUsefulValues_(n);
841 double vDs = getVDstar_(n, values[
"a1"], values[
"a2"], values[
"dn"]);
842 double uDs = getUDstar_(n, values[
"a1"], vDs);
843 double eta =
static_cast<double>(totalNumberOfMutations(group));
845 throw ZeroDivisionException(
"eta should not be null");
846 double etas =
static_cast<double>(numberOfSingletons(group));
849 return ((_n * eta) - (values[
"a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
859 size_t n = ingroup.getNumberOfSequences();
860 double nn =
static_cast<double>(n);
861 map<string, double> values = getUsefulValues_(n);
862 double pi = tajima83(ingroup,
true);
863 double vF = (values[
"cn"] + values[
"b2"] - 2. / (nn - 1.)) / (pow(values[
"a1"], 2) + values[
"a2"]);
864 double uF = ((1. + values[
"b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values[
"a1n"] - (2. * nn) / (nn + 1.))) / values[
"a1"]) - vF;
865 double eta =
static_cast<double>(totalNumberOfMutations(ingroup));
867 throw ZeroDivisionException(
"eta should not be null");
870 etae =
static_cast<double>(numberOfSingletons(outgroup));
872 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
873 return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
878 double n =
static_cast<double>(group.getNumberOfSequences());
879 map<string, double> values = getUsefulValues_(group.getNumberOfSequences());
880 double pi = tajima83(group,
true);
887 double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values[
"a1"]) / (n * n)) - 8 * values[
"a2"] / n) / (pow(values[
"a1"], 2) + values[
"a2"]);
888 double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values[
"a1n"]) / (3 * n * (n - 1))) / values[
"a1"]) - vFs;
889 double eta =
static_cast<double>(totalNumberOfMutations(group));
891 throw ZeroDivisionException(
"eta should not be null");
892 double etas =
static_cast<double>(numberOfSingletons(group));
895 return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
900 vector<double> vdiff;
901 double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
906 piIntra1 = SequenceStatistics::tajima83(*Pop1,
false);
907 piIntra2 = SequenceStatistics::tajima83(*Pop2,
false);
909 meanPiIntra = (piIntra1 + piIntra2) / 2;
912 for (
size_t i = 0; i < Pop1->getNumberOfSequences(); i++)
914 const Sequence& s1 = Pop1->getSequence(i);
915 for (
size_t j = 0; j < Pop2->getNumberOfSequences(); j++)
918 const Sequence& s2 = Pop2->getSequence(j);
919 vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2,
true,
"no gap",
true));
922 piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
925 Fst = 1.0 - meanPiIntra / piInter;
945 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
949 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
956 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
963 const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
964 Alphabet* alpha =
new DNA();
967 for (
size_t i = 0; i < sc->getNumberOfSites(); i++)
969 const Site& site = sc->getSite(i);
970 Site siteclone(site);
971 bool deletesite =
false;
972 map<int, double> freqs;
973 SymbolListTools::getFrequencies(siteclone, freqs);
975 for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
977 if (it->second >= 0.5)
980 for (
size_t j = 0; j < sc->getNumberOfSequences(); j++)
982 if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
984 if (freqs[site.getValue(j)] <= 1 - freqmin)
986 siteclone.setElement(j, 1);
987 first = site.getValue(j);
994 if (freqs[site.getValue(j)] >= freqmin)
995 siteclone.setElement(j, 0);
1001 ldpsc->addSite(siteclone);
1015 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
1019 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1021 const Site& site = psc.getSite(i);
1022 bool deletesite =
false;
1023 map<int, double> freqs;
1024 SymbolListTools::getFrequencies(site, freqs);
1025 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1027 if (freqs[j] >= 1 - freqmin)
1036 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1039 const Site& site = psc.getSite(i);
1040 bool deletesite =
false;
1041 map<int, double> freqs;
1042 SymbolListTools::getFrequencies(site, freqs);
1043 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1045 if (freqs[j] >= 1 - freqmin)
1055 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1057 for (
size_t i = 0; i < ss.size() - 1; i++)
1059 for (
size_t j = i + 1; j < ss.size(); j++)
1061 dist.push_back(static_cast<double>(ss[j] - ss[i]));
1070 for (
size_t i = 0; i < psc.getNumberOfSites(); i++)
1074 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1076 const Site& site = psc.getSite(i);
1077 bool deletesite =
false;
1078 map<int, double> freqs;
1079 SymbolListTools::getFrequencies(site, freqs);
1080 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1082 if (freqs[j] >= 1 - freqmin)
1091 if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1094 const Site& site = psc.getSite(i);
1095 bool deletesite =
false;
1096 map<int, double> freqs;
1097 SymbolListTools::getFrequencies(site, freqs);
1098 for (
int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1100 if (freqs[j] >= 1 - freqmin)
1108 size_t n = ss.size();
1110 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1111 Vdouble distance(n * (n - 1) / 2, 0);
1112 size_t nbsite = psc.getNumberOfSites();
1113 for (
size_t k = 0; k < psc.getNumberOfSequences(); k++)
1115 const Sequence& seq = psc.getSequence(k);
1116 SiteSelection gap, newss = ss;
1118 for (
size_t i = 0; i < nbsite; i++)
1120 if (seq.getValue(i) == -1)
1124 for (
size_t i = 0; i < gap.size(); i++)
1126 for (
size_t j = 0; j < ss.size(); j++)
1132 for (
size_t i = 0; i < n - 1; i++)
1134 for (
size_t j = i + 1; j < n; j++)
1136 dist.push_back(static_cast<double>(newss[j] - newss[i]));
1141 distance = distance /
static_cast<double>(psc.getNumberOfSequences());
1149 size_t nbsite = newpsc->getNumberOfSites();
1150 size_t nbseq = newpsc->getNumberOfSequences();
1152 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1154 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1155 for (
size_t i = 0; i < nbsite - 1; i++)
1157 for (
size_t j = i + 1; j < nbsite; j++)
1160 const Site& site1 = newpsc->getSite(i);
1161 const Site& site2 = newpsc->getSite(j);
1162 map<int, double> freq1;
1163 map<int, double> freq2;
1164 SymbolListTools::getFrequencies(site1, freq1);
1165 SymbolListTools::getFrequencies(site2, freq2);
1166 for (
size_t k = 0; k < nbseq; k++)
1168 if (site1.getValue(k) + site2.getValue(k) == 2)
1171 haplo = haplo /
static_cast<double>(nbseq);
1172 D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1182 size_t nbsite = newpsc->getNumberOfSites();
1183 size_t nbseq = newpsc->getNumberOfSequences();
1185 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1187 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1188 for (
size_t i = 0; i < nbsite - 1; i++)
1190 for (
size_t j = i + 1; j < nbsite; j++)
1193 const Site& site1 = newpsc->getSite(i);
1194 const Site& site2 = newpsc->getSite(j);
1195 map<int, double> freq1;
1196 map<int, double> freq2;
1197 SymbolListTools::getFrequencies(site1, freq1);
1198 SymbolListTools::getFrequencies(site2, freq2);
1199 for (
size_t k = 0; k < nbseq; k++)
1201 if (site1.getValue(k) + site2.getValue(k) == 2)
1204 haplo = haplo /
static_cast<double>(nbseq);
1205 double d, D = (haplo - freq1[1] * freq2[1]);
1208 if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1210 d = std::abs(D) / (freq1[1] * freq2[0]);
1214 d = std::abs(D) / (freq1[0] * freq2[1]);
1219 if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1221 d = std::abs(D) / (freq1[1] * freq2[1]);
1225 d = std::abs(D) / (freq1[0] * freq2[0]);
1228 Dprime.push_back(d);
1238 size_t nbsite = newpsc->getNumberOfSites();
1239 size_t nbseq = newpsc->getNumberOfSequences();
1241 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1243 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1244 for (
size_t i = 0; i < nbsite - 1; i++)
1246 for (
size_t j = i + 1; j < nbsite; j++)
1249 const Site& site1 = newpsc->getSite(i);
1250 const Site& site2 = newpsc->getSite(j);
1251 map<int, double> freq1;
1252 map<int, double> freq2;
1253 SymbolListTools::getFrequencies(site1, freq1);
1254 SymbolListTools::getFrequencies(site2, freq2);
1255 for (
size_t k = 0; k < nbseq; k++)
1257 if (site1.getValue(k) + site2.getValue(k) == 2)
1260 haplo = haplo /
static_cast<double>(nbseq);
1261 double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1274 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1275 return VectorTools::mean<double, double>(D);
1282 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1283 return VectorTools::mean<double, double>(Dprime);
1285 catch (DimensionException& e)
1295 Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1296 return VectorTools::mean<double, double>(R2);
1298 catch (DimensionException& e)
1308 Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1309 return VectorTools::mean<double, double>(dist);
1311 catch (DimensionException& e)
1321 Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1322 return VectorTools::mean<double, double>(dist);
1324 catch (DimensionException& e)
1334 double SequenceStatistics::originRegressionD(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1338 Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1341 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1343 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1344 return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1346 catch (DimensionException& e)
1352 double SequenceStatistics::originRegressionDprime(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1356 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1359 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1361 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1362 return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1364 catch (DimensionException& e)
1370 double SequenceStatistics::originRegressionR2(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1374 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1377 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1379 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1380 return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1382 catch (DimensionException& e)
1388 Vdouble SequenceStatistics::linearRegressionD(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1392 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1396 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1398 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1399 reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1400 reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1403 catch (DimensionException& e)
1409 Vdouble SequenceStatistics::linearRegressionDprime(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1413 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1417 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1419 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1420 reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1421 reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1424 catch (DimensionException& e)
1430 Vdouble SequenceStatistics::linearRegressionR2(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1434 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1438 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1440 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1441 reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1442 reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1445 catch (DimensionException& e)
1451 double SequenceStatistics::inverseRegressionR2(
const PolymorphismSequenceContainer& psc,
bool distance1,
bool keepsingleton,
double freqmin)
throw (DimensionException)
1455 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1456 Vdouble unit(R2.size(), 1);
1457 Vdouble R2transformed = unit / R2 - 1;
1460 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1462 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1463 return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1465 catch (DimensionException& e)
1477 double left = leftHandHudson_(psc);
1478 size_t n = psc.getNumberOfSequences();
1482 if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1484 if (rightHandHudson_(c1, n) < left)
1486 if (rightHandHudson_(c2, n) > left)
1488 while (dif > precision)
1490 if (rightHandHudson_((c1 + c2) / 2, n) > left)
1494 dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1496 return (c1 + c2) / 2;
1503 void SequenceStatistics::testUsefulValues(std::ostream& s,
size_t n)
1505 map<string, double> v = getUsefulValues_(n);
1506 double vD = getVD_(n, v[
"a1"], v[
"a2"], v[
"cn"]);
1507 double uD = getUD_(v[
"a1"], vD);
1508 double vDs = getVDstar_(n, v[
"a1"], v[
"a2"], v[
"dn"]);
1509 double uDs = getUDstar_(n, v[
"a1"], vDs);
1512 s << v[
"a1"] <<
"\t";
1513 s << v[
"a2"] <<
"\t";
1514 s << v[
"a1n"] <<
"\t";
1515 s << v[
"b1"] <<
"\t";
1516 s << v[
"b2"] <<
"\t";
1517 s << v[
"c1"] <<
"\t";
1518 s << v[
"c2"] <<
"\t";
1519 s << v[
"cn"] <<
"\t";
1520 s << v[
"dn"] <<
"\t";
1521 s << v[
"e1"] <<
"\t";
1522 s << v[
"e2"] <<
"\t";
1533 unsigned int SequenceStatistics::getNumberOfMutations_(
const Site& site)
1535 unsigned int tmp_count = 0;
1536 map<int, size_t> states_count;
1537 SymbolListTools::getCounts(site, states_count);
1539 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1549 unsigned int SequenceStatistics::getNumberOfSingletons_(
const Site& site)
1551 unsigned int nus = 0;
1552 map<int, size_t> states_count;
1553 SymbolListTools::getCounts(site, states_count);
1554 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1556 if (it->second == 1)
1562 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(
const Site& site_in,
const Site& site_out)
1564 unsigned int nus = 0;
1565 map<int, size_t> states_count;
1566 map<int, size_t> outgroup_states_count;
1567 SymbolListTools::getCounts(site_in, states_count);
1568 SymbolListTools::getCounts(site_out, outgroup_states_count);
1570 if (outgroup_states_count.size() == 1)
1572 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1574 if (it->second == 1)
1576 if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1584 std::map<std::string, double> SequenceStatistics::getUsefulValues_(
size_t n)
1586 double nn =
static_cast<double>(n);
1587 map<string, double> values;
1601 for (
double i = 1; i < nn; i++)
1603 values[
"a1"] += 1. / i;
1604 values[
"a2"] += 1. / (i * i);
1606 values[
"a1n"] = values[
"a1"] + (1. / nn);
1607 values[
"b1"] = (nn + 1.) / (3. * (nn - 1.));
1608 values[
"b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1609 values[
"c1"] = values[
"b1"] - (1. / values[
"a1"]);
1610 values[
"c2"] = values[
"b2"] - ((nn + 2.) / (values[
"a1"] * nn)) + (values[
"a2"] / (values[
"a1"] * values[
"a1"]));
1618 values[
"cn"] = 2. * ((nn * values[
"a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1621 + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1623 * ((3. / 2.) - (((2. * values[
"a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1625 values[
"e1"] = values[
"c1"] / values[
"a1"];
1626 values[
"e2"] = values[
"c2"] / ((values[
"a1"] * values[
"a1"]) + values[
"a2"]);
1631 double SequenceStatistics::getVD_(
size_t n,
double a1,
double a2,
double cn)
1633 double nn =
static_cast<double>(n);
1636 double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1640 double SequenceStatistics::getUD_(
double a1,
double vD)
1642 return a1 - 1. - vD;
1645 double SequenceStatistics::getVDstar_(
size_t n,
double a1,
double a2,
double dn)
1647 double denom = (a1 * a1) + a2;
1648 if (n < 3 || denom == 0.)
1650 double nn =
static_cast<double>(n);
1651 double nnn = nn / (nn - 1.);
1656 - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1673 double SequenceStatistics::getUDstar_(
size_t n,
double a1,
double vDs)
1677 double nn =
static_cast<double>(n);
1678 double nnn = nn / (nn - 1.);
1680 double uDs = (nnn * (a1 - nnn)) - vDs;
1691 size_t nbseq = newpsc->getNumberOfSequences();
1694 for (
size_t i = 0; i < nbseq - 1; i++)
1696 for (
size_t j = i + 1; j < nbseq; j++)
1698 SequenceSelection ss(2);
1702 S1 += SequenceStatistics::watterson75(*psc2,
true);
1703 S2 += SequenceStatistics::watterson75(*psc2,
true) * SequenceStatistics::watterson75(*psc2,
true);
1707 double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1708 double H = SequenceStatistics::heterozygosity(*newpsc);
1709 double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1711 return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) /
static_cast<double>(nbseq - 1), 2.);
1714 double SequenceStatistics::rightHandHudson_(
double c,
size_t n)
1716 double nn =
static_cast<double>(n);
1717 return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
unsigned int getSequenceCount(size_t index) const
Get the count of a sequence by index.
void addSequence(const Sequence &sequence, bool checkName=true)
The PolymorphismSequenceContainer class.