bpp-popgen  2.2.0
SequenceStatistics.cpp
Go to the documentation of this file.
1 //
2 // File SequenceStatistics.cpp
3 // Authors: Eric Bazin
4 // Sylvain Gailard
5 // Khalid Belkhir
6 // Benoit Nabholz
7 // Created on: Wed Aug 04 2004
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide classes
14  for population genetics analysis.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41  */
42 
43 #include "SequenceStatistics.h" // class's header file
46 
47 // From the STL:
48 #include <ctype.h>
49 #include <cmath>
50 #include <iostream>
51 #include <vector>
52 
53 using namespace std;
54 
55 // From SeqLib:
56 #include <Bpp/Seq/Site.h>
57 #include <Bpp/Seq/SiteTools.h>
58 #include <Bpp/Seq/StringSequenceTools.h>
59 #include <Bpp/Seq/CodonSiteTools.h>
60 #include <Bpp/Seq/Alphabet/DNA.h>
61 #include <Bpp/Seq/Alphabet/AlphabetTools.h>
62 
63 #include <Bpp/Numeric/VectorTools.h>
64 #include <Bpp/Numeric/VectorExceptions.h>
65 
66 using namespace bpp;
67 
68 // ******************************************************************************
69 // Basic statistics
70 // ******************************************************************************
71 
72 unsigned int SequenceStatistics::numberOfPolymorphicSites(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
73 {
74  unsigned int s = 0;
75  const Site* site = 0;
76  auto_ptr<ConstSiteIterator> si;
77  if (gapflag)
78  si.reset(new CompleteSiteContainerIterator(psc));
79  else
80  si.reset(new SimpleSiteContainerIterator(psc));
81  while (si->hasMoreSites())
82  {
83  site = si->nextSite();
84  if (!SiteTools::isConstant(*site, ignoreUnknown))
85  {
86  s++;
87  }
88  }
89  return s;
90 }
91 
92 unsigned int SequenceStatistics::numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer& psc, bool gapflag)
93 {
94  auto_ptr<ConstSiteIterator> si;
95  if (gapflag)
96  si.reset(new CompleteSiteContainerIterator(psc));
97  else
98  si.reset(new SimpleSiteContainerIterator(psc));
99  unsigned int s = 0;
100  const Site* site = 0;
101  while (si->hasMoreSites())
102  {
103  site = si->nextSite();
104  if (SiteTools::isParsimonyInformativeSite(*site))
105  {
106  s++;
107  }
108  }
109  return s;
110 }
111 
112 unsigned int SequenceStatistics::numberOfSingletons(const PolymorphismSequenceContainer& psc, bool gapflag)
113 {
114  auto_ptr<ConstSiteIterator> si;
115  if (gapflag)
116  si.reset(new CompleteSiteContainerIterator(psc));
117  else
118  si.reset(new SimpleSiteContainerIterator(psc));
119  unsigned int nus = 0;
120  const Site* site = 0;
121  while (si->hasMoreSites())
122  {
123  site = si->nextSite();
124  nus += getNumberOfSingletons_(*site);
125  }
126  return nus;
127 }
128 
129 unsigned int SequenceStatistics::numberOfTriplets(const PolymorphismSequenceContainer& psc, bool gapflag)
130 {
131  auto_ptr<ConstSiteIterator> si;
132  if (gapflag)
133  si.reset(new CompleteSiteContainerIterator(psc));
134  else
135  si.reset(new SimpleSiteContainerIterator(psc));
136  unsigned int s = 0;
137  const Site* site = 0;
138  while (si->hasMoreSites())
139  {
140  site = si->nextSite();
141  if (SiteTools::isTriplet(*site))
142  {
143  s++;
144  }
145  }
146  return s;
147 }
148 
149 unsigned int SequenceStatistics::totalNumberOfMutations(const PolymorphismSequenceContainer& psc, bool gapflag)
150 {
151  auto_ptr<ConstSiteIterator> si;
152  if (gapflag)
153  si.reset(new CompleteSiteContainerIterator(psc));
154  else
155  si.reset(new SimpleSiteContainerIterator(psc));
156  unsigned int tnm = 0;
157  const Site* site = 0;
158  while (si->hasMoreSites())
159  {
160  site = si->nextSite();
161  tnm += getNumberOfMutations_(*site);
162  }
163  return tnm;
164 }
165 
166 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
168  const PolymorphismSequenceContainer& outg) throw (Exception)
169 {
170  if (ing.getNumberOfSites() != outg.getNumberOfSites())
171  throw Exception("ing and outg must have the same size");
172  unsigned int nmuts = 0;
173  const Site* site_in = 0;
174  const Site* site_out = 0;
175  auto_ptr<ConstSiteIterator> si(new SimpleSiteContainerIterator(ing));
176  auto_ptr<ConstSiteIterator> so(new SimpleSiteContainerIterator(outg));
177  while (si->hasMoreSites())
178  {
179  site_in = si->nextSite();
180  site_out = so->nextSite();
181  // use fully resolved sites
182  if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
183  nmuts += getNumberOfDerivedSingletons_(*site_in, *site_out); // singletons that are not in outgroup
184  }
185  return nmuts;
186 }
187 
188 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
189 {
190  auto_ptr<ConstSiteIterator> si;
191  if (gapflag)
192  si.reset(new CompleteSiteContainerIterator(psc));
193  else
194  si.reset(new SimpleSiteContainerIterator(psc));
195  const Site* site = 0;
196  double s = 0;
197  while (si->hasMoreSites())
198  {
199  site = si->nextSite();
200  s += SiteTools::heterozygosity(*site);
201  }
202  return s;
203 }
204 
205 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
206 {
207  auto_ptr<ConstSiteIterator> si;
208  if (gapflag)
209  si.reset(new CompleteSiteContainerIterator(psc));
210  else
211  si.reset(new SimpleSiteContainerIterator(psc));
212  const Site* site = 0;
213  double s = 0;
214  while (si->hasMoreSites())
215  {
216  site = si->nextSite();
217  double h = SiteTools::heterozygosity(*site);
218  s += h * h;
219  }
220  return s;
221 }
222 
223 // ******************************************************************************
224 // GC statistics
225 // ******************************************************************************
226 
227 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc)
228 {
229  map<int, double> freqs;
230  SequenceContainerTools::getFrequencies(psc, freqs);
231  const Alphabet* alpha = psc.getAlphabet();
232  return (freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")]) / (freqs[alpha->charToInt("A")] + freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")] + freqs[alpha->charToInt("T")]);
233 }
234 
235 std::vector<unsigned int> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag)
236 {
237  unsigned int nbMut = 0;
238  unsigned int nbGC = 0;
239  size_t nbSeq = psc.getNumberOfSequences();
240  vector<unsigned int> vect(2);
241  const Site* site = 0;
242  auto_ptr<ConstSiteIterator> si;
243  if (gapflag)
244  si.reset(new CompleteSiteContainerIterator(psc));
245  else
246  si.reset(new NoGapSiteContainerIterator(psc));
247  while (si->hasMoreSites())
248  {
249  site = si->nextSite();
250  if (!SiteTools::isConstant(*site))
251  {
252  long double freqGC = SymbolListTools::getGCContent(*site);
253  /*
254  * Sylvain Gaillard 15/03/2010: realy unclear ...
255  * freqGC is always in [0,1] then why testing it ?
256  * why casting double into size_t ?
257  * is that method used by someone ?
258  */
259  if (freqGC > 0 && freqGC < 1)
260  {
261  nbMut += static_cast<unsigned int>(nbSeq);
262  long double adGC = freqGC * nbSeq;
263  nbGC += static_cast<unsigned int>(adGC);
264  }
265  }
266  }
267  vect[0] = nbMut;
268  vect[1] = nbGC;
269  return vect;
270 }
271 
272 // ******************************************************************************
273 // Diversity statistics
274 // ******************************************************************************
275 
276 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
277 {
278  double ThetaW;
279  size_t n = psc.getNumberOfSequences();
280  unsigned int s = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
281  map<string, double> values = getUsefulValues_(n);
282  ThetaW = static_cast<double>(s) / values["a1"];
283  return ThetaW;
284 }
285 
286 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag)
287 {
288  size_t alphabet_size = psc.getAlphabet()->getSize();
289  const Site* site = 0;
290  ConstSiteIterator* si = 0;
291  double value2 = 0.;
292  if (gapflag)
293  si = new CompleteSiteContainerIterator(psc);
294  else
295  si = new SimpleSiteContainerIterator(psc);
296  while (si->hasMoreSites())
297  {
298  site = si->nextSite();
299  if (!SiteTools::isConstant(*site))
300  {
301  double value = 0.;
302  map<int, size_t> count;
303  SymbolListTools::getCounts(*site, count);
304  map<int, size_t> tmp_k;
305  size_t tmp_n = 0;
306  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
307  {
308  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
309  {
310  tmp_k[it->first] = it->second * (it->second - 1);
311  tmp_n += it->second;
312  }
313  }
314  if (tmp_n == 0 || tmp_n == 1)
315  continue;
316  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
317  {
318  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
319  }
320  value2 += 1. - value;
321  }
322  }
323  delete si;
324  return value2;
325 }
326 
327 double SequenceStatistics::fayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites)
328 {
329  if (psc.getNumberOfSites() != ancestralSites.size())
330  throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
331 
332  const Sequence& tmps = psc.getSequence(0);
333 
334  size_t alphabet_size = (psc.getAlphabet())->getSize();
335  double value = 0.;
336  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
337  {
338  const Site& site = psc.getSite(i);
339  string ancB = ancestralSites.getChar(i);
340  int ancV = ancestralSites.getValue(i);
341 
342  if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
343  {
344  if (ancV < 0)
345  continue;
346 
347  map<int, size_t> count;
348  SymbolListTools::getCounts(site, count);
349  map<int, size_t> tmp_k;
350  size_t tmp_n = 0;
351  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
352  {
353  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
354  {
355  /* if derived allele */
356  if (it->first != ancV)
357  {
358  tmp_k[it->first] = 2 * it->second * it->second;
359  }
360  tmp_n += it->second;
361  }
362  }
363  if (tmp_n == 0 || tmp_n == 1)
364  continue;
365  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
366  {
367  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
368  }
369  }
370  }
371  return value;
372 }
373 
374 unsigned int SequenceStatistics::dvk(const PolymorphismSequenceContainer& psc, bool gapflag)
375 {
376  /*
377  * Sylvain Gaillard 17/03/2010:
378  * This implementation uses unneeded SequenceContainer recopy and works on
379  * string. It needs to be improved.
380  */
381  auto_ptr<PolymorphismSequenceContainer> sc;
382  if (gapflag)
383  sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
384  else
385  sc.reset(new PolymorphismSequenceContainer(psc));
386  // int K = 0;
387  vector<string> pscvector;
388  pscvector.push_back(sc->toString(0));
389  // K++;
390  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
391  {
392  bool uniq = true;
393  string query = sc->toString(i);
394  for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
395  {
396  if (query.compare(*it) == 0)
397  {
398  uniq = false;
399  break;
400  }
401  }
402  if (uniq)
403  {
404  // K++;
405  pscvector.push_back(query);
406  }
407  }
408  // return K;
409  return static_cast<unsigned int>(pscvector.size());
410 }
411 
412 double SequenceStatistics::dvh(const PolymorphismSequenceContainer& psc, bool gapflag)
413 {
414  /*
415  * Sylvain Gaillard 17/03/2010:
416  * This implementation uses unneeded SequenceContainer recopy and works on
417  * string. It needs to be improved.
418  */
419  auto_ptr<PolymorphismSequenceContainer> sc;
420  if (gapflag)
421  sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
422  else
423  sc.reset(new PolymorphismSequenceContainer(psc));
424  double H = 0.;
425  size_t nbSeq;
426  vector<string> pscvector;
427  vector<size_t> effvector;
428  pscvector.push_back(sc->toString(0));
429  effvector.push_back(sc->getSequenceCount(0));
430  nbSeq = sc->getSequenceCount(0);
431  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
432  {
433  nbSeq += sc->getSequenceCount(i);
434  bool uniq = true;
435  string query = sc->toString(i);
436  for (size_t j = 0; j < pscvector.size(); j++)
437  {
438  if (query.compare(pscvector[j]) == 0)
439  {
440  effvector[j] += sc->getSequenceCount(i);
441  uniq = false;
442  break;
443  }
444  }
445  if (uniq)
446  {
447  pscvector.push_back(query);
448  effvector.push_back(sc->getSequenceCount(i));
449  }
450  }
451  for (size_t i = 0; i < effvector.size(); i++)
452  {
453  H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq));
454  }
455  H += 1.;
456  return H;
457 }
458 
459 unsigned int SequenceStatistics::numberOfTransitions(const PolymorphismSequenceContainer& psc)
460 {
461  unsigned int nbT = 0;
462  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
463  const Site* site = 0;
464  while (si->hasMoreSites())
465  {
466  site = si->nextSite();
467  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
468  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
469  continue;
470  vector<int> seq = site->getContent();
471  int state1 = seq[0];
472  int state2 = seq[0];
473  for (size_t i = 1; i < seq.size(); i++)
474  {
475  if (state1 != seq[i])
476  {
477  state2 = seq[i];
478  break;
479  }
480  }
481  if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
482  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
483  {
484  nbT++;
485  }
486  }
487  return nbT;
488 }
489 
490 unsigned int SequenceStatistics::numberOfTransversions(const PolymorphismSequenceContainer& psc)
491 {
492  unsigned int nbTv = 0;
493  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
494  const Site* site = 0;
495  while (si->hasMoreSites())
496  {
497  site = si->nextSite();
498  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
499  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
500  continue;
501  vector<int> seq = site->getContent();
502  int state1 = seq[0];
503  int state2 = seq[0];
504  for (size_t i = 1; i < seq.size(); i++)
505  {
506  if (state1 != seq[i])
507  {
508  state2 = seq[i];
509  break;
510  }
511  }
512  if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
513  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
514  {
515  nbTv++;
516  }
517  }
518  return nbTv;
519 }
520 
521 double SequenceStatistics::ratioOfTransitionsTransversions(const PolymorphismSequenceContainer& psc) throw (Exception)
522 {
523  // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc);
524  double nbTs = 0;
525  double nbTv = 0;
526  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
527  const Site* site = 0;
528  vector<int> state(2);
529  while (si->hasMoreSites())
530  {
531  map<int, size_t> count;
532  site = si->nextSite();
533  SymbolListTools::getCounts(*site, count);
534  if (count.size() != 2)
535  continue;
536  size_t i = 0;
537  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
538  {
539  state[i] = it->first;
540  i++;
541  }
542  if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
543  ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
544  {
545  nbTs++; // transitions
546  }
547  else
548  {
549  nbTv++; // transversion
550  }
551  }
552  if (nbTv == 0)
553  throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio.");
554  return nbTs / nbTv;
555 }
556 
557 // ******************************************************************************
558 // Synonymous and non-synonymous polymorphism
559 // ******************************************************************************
560 
561 unsigned int SequenceStatistics::numberOfSitesWithStopCodon(const PolymorphismSequenceContainer& psc, const GeneticCode& gCode, bool gapflag)
562 {
563  /*
564  * Sylvain Gaillard 17/03/2010
565  * What if the Alphabet is not a codon alphabet?
566  */
567  if (!AlphabetTools::isCodonAlphabet(psc.getAlphabet()))
568  throw AlphabetMismatchException("SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", psc.getAlphabet());
569 
570  auto_ptr<ConstSiteIterator> si;
571  if (gapflag)
572  si.reset(new NoGapSiteContainerIterator(psc));
573  else
574  si.reset(new SimpleSiteContainerIterator(psc));
575  unsigned int s = 0;
576  const Site* site = 0;
577  while (si->hasMoreSites())
578  {
579  site = si->nextSite();
580  if (CodonSiteTools::hasStop(*site, gCode))
581  s++;
582  }
583  return s;
584 }
585 
586 unsigned int SequenceStatistics::numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag)
587 {
588  auto_ptr<ConstSiteIterator> si;
589  if (stopflag)
590  si.reset(new CompleteSiteContainerIterator(psc));
591  else
592  {
593  if (gapflag)
594  si.reset(new NoGapSiteContainerIterator(psc));
595  else
596  si.reset(new SimpleSiteContainerIterator(psc));
597  }
598  unsigned int s = 0;
599  const Site* site;
600  while (si->hasMoreSites())
601  {
602  site = si->nextSite();
603  if (CodonSiteTools::isMonoSitePolymorphic(*site))
604  s++;
605  }
606  return s;
607 }
608 
609 unsigned int SequenceStatistics::numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
610 {
611  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
612  unsigned int s = 0;
613  const Site* site;
614  while (si->hasMoreSites())
615  {
616  site = si->nextSite();
617  if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
618  s++;
619  }
620  return s;
621 }
622 
623 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
624 {
625  double ThetaW = 0.;
626  size_t n = psc.getNumberOfSequences();
627  unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
628  map<string, double> values = getUsefulValues_(n);
629  ThetaW = static_cast<double>(s) / values["a1"];
630  return ThetaW;
631 }
632 
633 double SequenceStatistics::watterson75NonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
634 {
635  double ThetaW;
636  size_t n = psc.getNumberOfSequences();
637  unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
638  map<string, double> values = getUsefulValues_(n);
639  ThetaW = static_cast<double>(s) / values["a1"];
640  return ThetaW;
641 }
642 
643 double SequenceStatistics::piSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
644 {
645  double S = 0.;
646  ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
647  const Site* site = 0;
648  while (si->hasMoreSites())
649  {
650  site = si->nextSite();
651  S += CodonSiteTools::piSynonymous(*site, gc, minchange);
652  }
653  delete si;
654  return S;
655 }
656 
657 double SequenceStatistics::piNonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
658 {
659  double S = 0.;
660  ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
661  const Site* site = 0;
662  while (si->hasMoreSites())
663  {
664  site = si->nextSite();
665  S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
666  }
667  delete si;
668  return S;
669 }
670 
671 double SequenceStatistics::meanNumberOfSynonymousSites(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
672 {
673  double S = 0.;
674  ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
675  const Site* site = 0;
676  while (si->hasMoreSites())
677  {
678  site = si->nextSite();
679  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
680  }
681  delete si;
682  return S;
683 }
684 
685 double SequenceStatistics::meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
686 {
687  double S = 0.;
688  int n = 0;
689  ConstSiteIterator* si = new CompleteSiteContainerIterator(psc);
690  const Site* site = 0;
691  while (si->hasMoreSites())
692  {
693  site = si->nextSite();
694  n = n + 3;
695  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
696  }
697  delete si;
698  return static_cast<double>(n - S);
699 }
700 
701 unsigned int SequenceStatistics::numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
702 {
703  size_t st = 0, sns = 0;
704  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
705  const Site* site = 0;
706  while (si->hasMoreSites())
707  {
708  site = si->nextSite();
709  st += CodonSiteTools::numberOfSubsitutions(*site, gc, freqmin);
710  sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
711  }
712  return static_cast<unsigned int>(st - sns);
713 }
714 
715 unsigned int SequenceStatistics::numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
716 {
717  unsigned int sns = 0;
718  auto_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
719  const Site* site = 0;
720  while (si->hasMoreSites())
721  {
722  site = si->nextSite();
723  sns += static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin));
724  }
725  return sns;
726 }
727 
728 vector<unsigned int> SequenceStatistics::fixedDifferences(const PolymorphismSequenceContainer& pscin, const PolymorphismSequenceContainer& pscout, PolymorphismSequenceContainer& psccons, const GeneticCode& gc)
729 {
730  auto_ptr<ConstSiteIterator> siIn(new CompleteSiteContainerIterator(pscin));
731  auto_ptr<ConstSiteIterator> siOut(new CompleteSiteContainerIterator(pscout));
732  auto_ptr<ConstSiteIterator> siCons(new CompleteSiteContainerIterator(psccons));
733  const Site* siteIn = 0;
734  const Site* siteOut = 0;
735  const Site* siteCons = 0;
736  size_t NfixS = 0;
737  size_t NfixA = 0;
738  while (siIn->hasMoreSites())
739  {
740  siteIn = siIn->nextSite();
741  siteOut = siOut->nextSite();
742  siteCons = siCons->nextSite();
743  vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
744  NfixS += v[0];
745  NfixA += v[1];
746  }
747  vector<unsigned int> v(2);
748  v[0] = static_cast<unsigned int>(NfixS);
749  v[1] = static_cast<unsigned int>(NfixA);
750  return v;
751 }
752 
753 vector<unsigned int> SequenceStatistics::mkTable(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
754 {
755  PolymorphismSequenceContainer psctot(ingroup);
756  for (size_t i = 0; i < outgroup.getNumberOfSequences(); i++)
757  {
758  psctot.addSequence(outgroup.getSequence(i));
759  psctot.setAsOutgroupMember(i + ingroup.getNumberOfSequences());
760  }
761  auto_ptr<const PolymorphismSequenceContainer> psccomplet(PolymorphismSequenceContainerTools::getCompleteSites(psctot));
762  auto_ptr<const PolymorphismSequenceContainer> pscin (PolymorphismSequenceContainerTools::extractIngroup(*psccomplet));
763  auto_ptr<const PolymorphismSequenceContainer> pscout (PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet));
764  auto_ptr<const Sequence> consensusIn (SiteContainerTools::getConsensus(*pscin, "consensusIn"));
765  auto_ptr<const Sequence> consensusOut(SiteContainerTools::getConsensus(*pscout, "consensusOut"));
766  auto_ptr<PolymorphismSequenceContainer> consensus(new PolymorphismSequenceContainer(ingroup.getAlphabet()));
767  consensus->addSequence(*consensusIn);
768  consensus->addSequence(*consensusOut);
769  vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
770  vector<unsigned int> v(4);
771  v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscin, gc, freqmin);
772  v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscin, gc, freqmin);
773  v[2] = u[1];
774  v[3] = u[0];
775  return v;
776 }
777 
778 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
779 {
780  vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
781  if (v[1] != 0 && v[2] != 0)
782  return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
783  else
784  return -1;
785 }
786 
787 // ******************************************************************************
788 // Statistical tests
789 // ******************************************************************************
790 
791 double SequenceStatistics::tajimaDss(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
792 {
793  unsigned int Sp = numberOfPolymorphicSites(psc, gapflag);
794  if (Sp == 0)
795  throw ZeroDivisionException("SequenceStatistics::tajimaDss. S should not be 0.");
796  double S = static_cast<double>(Sp);
797  double tajima = tajima83(psc, gapflag);
798  double watterson = watterson75(psc, gapflag);
799  size_t n = psc.getNumberOfSequences();
800  map<string, double> values = getUsefulValues_(n);
801  return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1)));
802 }
803 
804 double SequenceStatistics::tajimaDtnm(const PolymorphismSequenceContainer& psc, bool gapflag) throw (ZeroDivisionException)
805 {
806  unsigned int etaP = totalNumberOfMutations(psc, gapflag);
807  if (etaP == 0)
808  throw ZeroDivisionException("SequenceStatistics::tajimaDtnm. Eta should not be 0.");
809  double eta = static_cast<double>(etaP);
810  double tajima = tajima83(psc, gapflag);
811  size_t n = psc.getNumberOfSequences();
812  map<string, double> values = getUsefulValues_(n);
813  double eta_a1 = eta / values["a1"];
814  return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1)));
815 }
816 
817 double SequenceStatistics::fuLiD(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
818 {
819  size_t n = ingroup.getNumberOfSequences();
820  map<string, double> values = getUsefulValues_(n);
821  double vD = getVD_(n, values["a1"], values["a2"], values["cn"]);
822  double uD = getUD_(values["a1"], vD);
823  unsigned int etaP = totalNumberOfMutations(ingroup);
824  if (etaP == 0)
825  throw ZeroDivisionException("SequenceStatistics::fuLiD. Eta should not be 0.");
826  double eta = static_cast<double>(etaP);
827  double etae = 0.;
828  if (original)
829  etae = static_cast<double>(numberOfSingletons(outgroup));
830  else
831  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
832  return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
833 }
834 
835 double SequenceStatistics::fuLiDStar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
836 {
837  size_t n = group.getNumberOfSequences();
838  double nn = static_cast<double>(n);
839  double _n = nn / (nn - 1.);
840  map<string, double> values = getUsefulValues_(n);
841  double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]);
842  double uDs = getUDstar_(n, values["a1"], vDs);
843  double eta = static_cast<double>(totalNumberOfMutations(group));
844  if (eta == 0.)
845  throw ZeroDivisionException("eta should not be null");
846  double etas = static_cast<double>(numberOfSingletons(group));
847 
848  // Fu & Li 1993
849  return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
850 
851  // Simonsen et al. 1995
852  /*
853  return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta);
854  */
855 }
856 
857 double SequenceStatistics::fuLiF(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, bool original) throw (ZeroDivisionException)
858 {
859  size_t n = ingroup.getNumberOfSequences();
860  double nn = static_cast<double>(n);
861  map<string, double> values = getUsefulValues_(n);
862  double pi = tajima83(ingroup, true);
863  double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]);
864  double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF;
865  double eta = static_cast<double>(totalNumberOfMutations(ingroup));
866  if (eta == 0.)
867  throw ZeroDivisionException("eta should not be null");
868  double etae = 0.;
869  if (original)
870  etae = static_cast<double>(numberOfSingletons(outgroup));
871  else
872  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
873  return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
874 }
875 
876 double SequenceStatistics::fuLiFStar(const PolymorphismSequenceContainer& group) throw (ZeroDivisionException)
877 {
878  double n = static_cast<double>(group.getNumberOfSequences());
879  map<string, double> values = getUsefulValues_(group.getNumberOfSequences());
880  double pi = tajima83(group, true);
881 
882  // Fu & Li 1993
883  // double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]);
884  // double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs;
885 
886  // Simonsen et al. 1995
887  double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]);
888  double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs;
889  double eta = static_cast<double>(totalNumberOfMutations(group));
890  if (eta == 0.)
891  throw ZeroDivisionException("eta should not be null");
892  double etas = static_cast<double>(numberOfSingletons(group));
893  // Fu & Li 1993
894  // Simonsen et al. 1995
895  return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
896 }
897 
898 double SequenceStatistics::fstHudson92(const PolymorphismSequenceContainer& psc, size_t id1, size_t id2)
899 {
900  vector<double> vdiff;
901  double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
902 
903  PolymorphismSequenceContainer* Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
904  PolymorphismSequenceContainer* Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
905 
906  piIntra1 = SequenceStatistics::tajima83(*Pop1, false);
907  piIntra2 = SequenceStatistics::tajima83(*Pop2, false);
908 
909  meanPiIntra = (piIntra1 + piIntra2) / 2;
910 
911  double n = 0;
912  for (size_t i = 0; i < Pop1->getNumberOfSequences(); i++)
913  {
914  const Sequence& s1 = Pop1->getSequence(i);
915  for (size_t j = 0; j < Pop2->getNumberOfSequences(); j++)
916  {
917  n++;
918  const Sequence& s2 = Pop2->getSequence(j);
919  vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true));
920  }
921  }
922  piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
923 
924 
925  Fst = 1.0 - meanPiIntra / piInter;
926 
927  delete Pop1;
928  delete Pop2;
929 
930  return Fst;
931 }
932 
933 // ******************************************************************************
934 // Linkage disequilibrium statistics
935 // ******************************************************************************
936 
937 /**********************/
938 /* Preliminary method */
939 /**********************/
940 
941 PolymorphismSequenceContainer* SequenceStatistics::generateLdContainer(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
942 {
943  SiteSelection ss;
944  // Extract polymorphic site with only two alleles
945  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
946  {
947  if (keepsingleton)
948  {
949  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
950  {
951  ss.push_back(i);
952  }
953  }
954  else
955  {
956  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
957  {
958  ss.push_back(i);
959  }
960  }
961  }
962 
963  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
964  Alphabet* alpha = new DNA(); // Sylvain Gaillard 17/03/2010: What if psc's Alphabet is not DNA
965  PolymorphismSequenceContainer* ldpsc = new PolymorphismSequenceContainer(sc->getNumberOfSequences(), alpha);
966  // Assign 1 to the more frequent and 0 to the less frequent alleles
967  for (size_t i = 0; i < sc->getNumberOfSites(); i++)
968  {
969  const Site& site = sc->getSite(i);
970  Site siteclone(site);
971  bool deletesite = false;
972  map<int, double> freqs;
973  SymbolListTools::getFrequencies(siteclone, freqs);
974  int first = 0;
975  for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
976  {
977  if (it->second >= 0.5)
978  first = it->first;
979  }
980  for (size_t j = 0; j < sc->getNumberOfSequences(); j++)
981  {
982  if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
983  {
984  if (freqs[site.getValue(j)] <= 1 - freqmin)
985  {
986  siteclone.setElement(j, 1);
987  first = site.getValue(j);
988  }
989  else
990  deletesite = true;
991  }
992  else
993  {
994  if (freqs[site.getValue(j)] >= freqmin)
995  siteclone.setElement(j, 0);
996  else
997  deletesite = true;
998  }
999  }
1000  if (!deletesite)
1001  ldpsc->addSite(siteclone);
1002  }
1003  delete alpha;
1004  return ldpsc;
1005 }
1006 
1007 /*************************************/
1008 /* Pairwise LD and distance measures */
1009 /*************************************/
1010 
1011 Vdouble SequenceStatistics::pairwiseDistances1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1012 {
1013  // get Positions with sites of interest
1014  SiteSelection ss;
1015  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1016  {
1017  if (keepsingleton)
1018  {
1019  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1020  {
1021  const Site& site = psc.getSite(i);
1022  bool deletesite = false;
1023  map<int, double> freqs;
1024  SymbolListTools::getFrequencies(site, freqs);
1025  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1026  {
1027  if (freqs[j] >= 1 - freqmin)
1028  deletesite = true;
1029  }
1030  if (!deletesite)
1031  ss.push_back(i);
1032  }
1033  }
1034  else
1035  {
1036  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1037  {
1038  ss.push_back(i);
1039  const Site& site = psc.getSite(i);
1040  bool deletesite = false;
1041  map<int, double> freqs;
1042  SymbolListTools::getFrequencies(site, freqs);
1043  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1044  {
1045  if (freqs[j] >= 1 - freqmin)
1046  deletesite = true;
1047  }
1048  if (!deletesite)
1049  ss.push_back(i);
1050  }
1051  }
1052  }
1053  // compute pairwise distances
1054  if (ss.size() < 2)
1055  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1056  Vdouble dist;
1057  for (size_t i = 0; i < ss.size() - 1; i++)
1058  {
1059  for (size_t j = i + 1; j < ss.size(); j++)
1060  {
1061  dist.push_back(static_cast<double>(ss[j] - ss[i]));
1062  }
1063  }
1064  return dist;
1065 }
1066 
1067 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1068 {
1069  SiteSelection ss;
1070  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1071  {
1072  if (keepsingleton)
1073  {
1074  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1075  {
1076  const Site& site = psc.getSite(i);
1077  bool deletesite = false;
1078  map<int, double> freqs;
1079  SymbolListTools::getFrequencies(site, freqs);
1080  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1081  {
1082  if (freqs[j] >= 1 - freqmin)
1083  deletesite = true;
1084  }
1085  if (!deletesite)
1086  ss.push_back(i);
1087  }
1088  }
1089  else
1090  {
1091  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1092  {
1093  ss.push_back(i);
1094  const Site& site = psc.getSite(i);
1095  bool deletesite = false;
1096  map<int, double> freqs;
1097  SymbolListTools::getFrequencies(site, freqs);
1098  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1099  {
1100  if (freqs[j] >= 1 - freqmin)
1101  deletesite = true;
1102  }
1103  if (!deletesite)
1104  ss.push_back(i);
1105  }
1106  }
1107  }
1108  size_t n = ss.size();
1109  if (n < 2)
1110  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1111  Vdouble distance(n * (n - 1) / 2, 0);
1112  size_t nbsite = psc.getNumberOfSites();
1113  for (size_t k = 0; k < psc.getNumberOfSequences(); k++)
1114  {
1115  const Sequence& seq = psc.getSequence(k);
1116  SiteSelection gap, newss = ss;
1117  Vdouble dist;
1118  for (size_t i = 0; i < nbsite; i++)
1119  {
1120  if (seq.getValue(i) == -1)
1121  gap.push_back(i);
1122  }
1123  // Site positions are re-numbered to take gaps into account
1124  for (size_t i = 0; i < gap.size(); i++)
1125  {
1126  for (size_t j = 0; j < ss.size(); j++)
1127  {
1128  if (ss[j] > gap[i])
1129  newss[j]--;
1130  }
1131  }
1132  for (size_t i = 0; i < n - 1; i++)
1133  {
1134  for (size_t j = i + 1; j < n; j++)
1135  {
1136  dist.push_back(static_cast<double>(newss[j] - newss[i]));
1137  }
1138  }
1139  distance += dist;
1140  }
1141  distance = distance / static_cast<double>(psc.getNumberOfSequences());
1142  return distance;
1143 }
1144 
1145 Vdouble SequenceStatistics::pairwiseD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1146 {
1147  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1148  Vdouble D;
1149  size_t nbsite = newpsc->getNumberOfSites();
1150  size_t nbseq = newpsc->getNumberOfSequences();
1151  if (nbsite < 2)
1152  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1153  if (nbseq < 2)
1154  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1155  for (size_t i = 0; i < nbsite - 1; i++)
1156  {
1157  for (size_t j = i + 1; j < nbsite; j++)
1158  {
1159  double haplo = 0;
1160  const Site& site1 = newpsc->getSite(i);
1161  const Site& site2 = newpsc->getSite(j);
1162  map<int, double> freq1;
1163  map<int, double> freq2;
1164  SymbolListTools::getFrequencies(site1, freq1);
1165  SymbolListTools::getFrequencies(site2, freq2);
1166  for (size_t k = 0; k < nbseq; k++)
1167  {
1168  if (site1.getValue(k) + site2.getValue(k) == 2)
1169  haplo++;
1170  }
1171  haplo = haplo / static_cast<double>(nbseq);
1172  D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1173  }
1174  }
1175  return D;
1176 }
1177 
1178 Vdouble SequenceStatistics::pairwiseDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1179 {
1180  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1181  Vdouble Dprime;
1182  size_t nbsite = newpsc->getNumberOfSites();
1183  size_t nbseq = newpsc->getNumberOfSequences();
1184  if (nbsite < 2)
1185  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1186  if (nbseq < 2)
1187  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1188  for (size_t i = 0; i < nbsite - 1; i++)
1189  {
1190  for (size_t j = i + 1; j < nbsite; j++)
1191  {
1192  double haplo = 0;
1193  const Site& site1 = newpsc->getSite(i);
1194  const Site& site2 = newpsc->getSite(j);
1195  map<int, double> freq1;
1196  map<int, double> freq2;
1197  SymbolListTools::getFrequencies(site1, freq1);
1198  SymbolListTools::getFrequencies(site2, freq2);
1199  for (size_t k = 0; k < nbseq; k++)
1200  {
1201  if (site1.getValue(k) + site2.getValue(k) == 2)
1202  haplo++;
1203  }
1204  haplo = haplo / static_cast<double>(nbseq);
1205  double d, D = (haplo - freq1[1] * freq2[1]);
1206  if (D > 0)
1207  {
1208  if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1209  {
1210  d = std::abs(D) / (freq1[1] * freq2[0]);
1211  }
1212  else
1213  {
1214  d = std::abs(D) / (freq1[0] * freq2[1]);
1215  }
1216  }
1217  else
1218  {
1219  if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1220  {
1221  d = std::abs(D) / (freq1[1] * freq2[1]);
1222  }
1223  else
1224  {
1225  d = std::abs(D) / (freq1[0] * freq2[0]);
1226  }
1227  }
1228  Dprime.push_back(d);
1229  }
1230  }
1231  return Dprime;
1232 }
1233 
1234 Vdouble SequenceStatistics::pairwiseR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1235 {
1236  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1237  Vdouble R2;
1238  size_t nbsite = newpsc->getNumberOfSites();
1239  size_t nbseq = newpsc->getNumberOfSequences();
1240  if (nbsite < 2)
1241  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1242  if (nbseq < 2)
1243  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1244  for (size_t i = 0; i < nbsite - 1; i++)
1245  {
1246  for (size_t j = i + 1; j < nbsite; j++)
1247  {
1248  double haplo = 0;
1249  const Site& site1 = newpsc->getSite(i);
1250  const Site& site2 = newpsc->getSite(j);
1251  map<int, double> freq1;
1252  map<int, double> freq2;
1253  SymbolListTools::getFrequencies(site1, freq1);
1254  SymbolListTools::getFrequencies(site2, freq2);
1255  for (size_t k = 0; k < nbseq; k++)
1256  {
1257  if (site1.getValue(k) + site2.getValue(k) == 2)
1258  haplo++;
1259  }
1260  haplo = haplo / static_cast<double>(nbseq);
1261  double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1262  R2.push_back(r);
1263  }
1264  }
1265  return R2;
1266 }
1267 
1268 /***********************************/
1269 /* Global LD and distance measures */
1270 /***********************************/
1271 
1272 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1273 {
1274  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1275  return VectorTools::mean<double, double>(D);
1276 }
1277 
1278 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1279 {
1280  try
1281  {
1282  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1283  return VectorTools::mean<double, double>(Dprime);
1284  }
1285  catch (DimensionException& e)
1286  {
1287  throw e;
1288  }
1289 }
1290 
1291 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1292 {
1293  try
1294  {
1295  Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1296  return VectorTools::mean<double, double>(R2);
1297  }
1298  catch (DimensionException& e)
1299  {
1300  throw e;
1301  }
1302 }
1303 
1304 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1305 {
1306  try
1307  {
1308  Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1309  return VectorTools::mean<double, double>(dist);
1310  }
1311  catch (DimensionException& e)
1312  {
1313  throw e;
1314  }
1315 }
1316 
1317 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin) throw (DimensionException)
1318 {
1319  try
1320  {
1321  Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1322  return VectorTools::mean<double, double>(dist);
1323  }
1324  catch (DimensionException& e)
1325  {
1326  throw e;
1327  }
1328 }
1329 
1330 /**********************/
1331 /* Regression methods */
1332 /**********************/
1333 
1334 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1335 {
1336  try
1337  {
1338  Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1339  Vdouble dist;
1340  if (distance1)
1341  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1342  else
1343  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1344  return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1345  }
1346  catch (DimensionException& e)
1347  {
1348  throw e;
1349  }
1350 }
1351 
1352 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1353 {
1354  try
1355  {
1356  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1357  Vdouble dist;
1358  if (distance1)
1359  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1360  else
1361  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1362  return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1363  }
1364  catch (DimensionException& e)
1365  {
1366  throw e;
1367  }
1368 }
1369 
1370 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1371 {
1372  try
1373  {
1374  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1375  Vdouble dist;
1376  if (distance1)
1377  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1378  else
1379  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1380  return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1381  }
1382  catch (DimensionException& e)
1383  {
1384  throw e;
1385  }
1386 }
1387 
1388 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1389 {
1390  try
1391  {
1392  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1393  Vdouble dist;
1394  Vdouble reg(2);
1395  if (distance1)
1396  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1397  else
1398  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1399  reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1400  reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1401  return reg;
1402  }
1403  catch (DimensionException& e)
1404  {
1405  throw e;
1406  }
1407 }
1408 
1409 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1410 {
1411  try
1412  {
1413  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1414  Vdouble dist;
1415  Vdouble reg(2);
1416  if (distance1)
1417  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1418  else
1419  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1420  reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1421  reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1422  return reg;
1423  }
1424  catch (DimensionException& e)
1425  {
1426  throw e;
1427  }
1428 }
1429 
1430 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1431 {
1432  try
1433  {
1434  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1435  Vdouble dist;
1436  Vdouble reg(2);
1437  if (distance1)
1438  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1439  else
1440  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1441  reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1442  reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1443  return reg;
1444  }
1445  catch (DimensionException& e)
1446  {
1447  throw e;
1448  }
1449 }
1450 
1451 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin) throw (DimensionException)
1452 {
1453  try
1454  {
1455  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1456  Vdouble unit(R2.size(), 1);
1457  Vdouble R2transformed = unit / R2 - 1;
1458  Vdouble dist;
1459  if (distance1)
1460  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1461  else
1462  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1463  return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1464  }
1465  catch (DimensionException& e)
1466  {
1467  throw e;
1468  }
1469 }
1470 
1471 /**********************/
1472 /* Hudson method */
1473 /**********************/
1474 
1475 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup)
1476 {
1477  double left = leftHandHudson_(psc);
1478  size_t n = psc.getNumberOfSequences();
1479  double dif = 1;
1480  double c1 = cinf;
1481  double c2 = csup;
1482  if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1483  return -1;
1484  if (rightHandHudson_(c1, n) < left)
1485  return cinf;
1486  if (rightHandHudson_(c2, n) > left)
1487  return csup;
1488  while (dif > precision)
1489  {
1490  if (rightHandHudson_((c1 + c2) / 2, n) > left)
1491  c1 = (c1 + c2) / 2;
1492  else
1493  c2 = (c1 + c2) / 2;
1494  dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1495  }
1496  return (c1 + c2) / 2;
1497 }
1498 
1499 /*****************/
1500 /* Tests methods */
1501 /*****************/
1502 
1503 void SequenceStatistics::testUsefulValues(std::ostream& s, size_t n)
1504 {
1505  map<string, double> v = getUsefulValues_(n);
1506  double vD = getVD_(n, v["a1"], v["a2"], v["cn"]);
1507  double uD = getUD_(v["a1"], vD);
1508  double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]);
1509  double uDs = getUDstar_(n, v["a1"], vDs);
1510 
1511  s << n << "\t";
1512  s << v["a1"] << "\t";
1513  s << v["a2"] << "\t";
1514  s << v["a1n"] << "\t";
1515  s << v["b1"] << "\t";
1516  s << v["b2"] << "\t";
1517  s << v["c1"] << "\t";
1518  s << v["c2"] << "\t";
1519  s << v["cn"] << "\t";
1520  s << v["dn"] << "\t";
1521  s << v["e1"] << "\t";
1522  s << v["e2"] << "\t";
1523  s << uD << "\t";
1524  s << vD << "\t";
1525  s << uDs << "\t";
1526  s << vDs << endl;
1527 }
1528 
1529 // ******************************************************************************
1530 // Private methods
1531 // ******************************************************************************
1532 
1533 unsigned int SequenceStatistics::getNumberOfMutations_(const Site& site)
1534 {
1535  unsigned int tmp_count = 0;
1536  map<int, size_t> states_count;
1537  SymbolListTools::getCounts(site, states_count);
1538 
1539  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1540  {
1541  if (it->first >= 0)
1542  tmp_count++;
1543  }
1544  if (tmp_count > 0)
1545  tmp_count--;
1546  return tmp_count;
1547 }
1548 
1549 unsigned int SequenceStatistics::getNumberOfSingletons_(const Site& site)
1550 {
1551  unsigned int nus = 0;
1552  map<int, size_t> states_count;
1553  SymbolListTools::getCounts(site, states_count);
1554  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1555  {
1556  if (it->second == 1)
1557  nus++;
1558  }
1559  return nus;
1560 }
1561 
1562 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(const Site& site_in, const Site& site_out)
1563 {
1564  unsigned int nus = 0;
1565  map<int, size_t> states_count;
1566  map<int, size_t> outgroup_states_count;
1567  SymbolListTools::getCounts(site_in, states_count);
1568  SymbolListTools::getCounts(site_out, outgroup_states_count);
1569  // if there is more than one variant in the outgroup we will not be able to recover the ancestral state
1570  if (outgroup_states_count.size() == 1)
1571  {
1572  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1573  {
1574  if (it->second == 1)
1575  {
1576  if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1577  nus++;
1578  }
1579  }
1580  }
1581  return nus;
1582 }
1583 
1584 std::map<std::string, double> SequenceStatistics::getUsefulValues_(size_t n)
1585 {
1586  double nn = static_cast<double>(n);
1587  map<string, double> values;
1588  values["a1"] = 0.;
1589  values["a2"] = 0.;
1590  values["a1n"] = 0.;
1591  values["b1"] = 0.;
1592  values["b2"] = 0.;
1593  values["c1"] = 0.;
1594  values["c2"] = 0.;
1595  values["cn"] = 0.;
1596  values["dn"] = 0.;
1597  values["e1"] = 0.;
1598  values["e2"] = 0.;
1599  if (n > 1)
1600  {
1601  for (double i = 1; i < nn; i++)
1602  {
1603  values["a1"] += 1. / i;
1604  values["a2"] += 1. / (i * i);
1605  }
1606  values["a1n"] = values["a1"] + (1. / nn);
1607  values["b1"] = (nn + 1.) / (3. * (nn - 1.));
1608  values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1609  values["c1"] = values["b1"] - (1. / values["a1"]);
1610  values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"]));
1611  if (n == 2)
1612  {
1613  values["cn"] = 1.;
1614  values["dn"] = 2.;
1615  }
1616  else
1617  {
1618  values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1619  values["dn"] =
1620  values["cn"]
1621  + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1622  + (2. / (nn - 1.))
1623  * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1624  }
1625  values["e1"] = values["c1"] / values["a1"];
1626  values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]);
1627  }
1628  return values;
1629 }
1630 
1631 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn)
1632 {
1633  double nn = static_cast<double>(n);
1634  if (n < 3)
1635  return 0.;
1636  double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1637  return vD;
1638 }
1639 
1640 double SequenceStatistics::getUD_(double a1, double vD)
1641 {
1642  return a1 - 1. - vD;
1643 }
1644 
1645 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn)
1646 {
1647  double denom = (a1 * a1) + a2;
1648  if (n < 3 || denom == 0.)
1649  return 0.;
1650  double nn = static_cast<double>(n);
1651  double nnn = nn / (nn - 1.);
1652  // Fu & Li 1993
1653  double vDs = (
1654  (nnn * nnn * a2)
1655  + (a1 * a1 * dn)
1656  - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1657  )
1658  /
1659  denom;
1660  // Simonsen et al. 1995
1661  /*
1662  double vDs = (
1663  (values["a2"] / pow(values["a1"], 2))
1664  - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn)
1665  - 1./(nn*nn)
1666  )
1667  /
1668  (pow(values["a1"], 2) + values["a2"]);
1669  */
1670  return vDs;
1671 }
1672 
1673 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs)
1674 {
1675  if (n < 3)
1676  return 0.;
1677  double nn = static_cast<double>(n);
1678  double nnn = nn / (nn - 1.);
1679  // Fu & Li 1993
1680  double uDs = (nnn * (a1 - nnn)) - vDs;
1681  // Simonsen et al. 1995
1682  /*
1683  double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs;
1684  */
1685  return uDs;
1686 }
1687 
1688 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc)
1689 {
1690  PolymorphismSequenceContainer* newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
1691  size_t nbseq = newpsc->getNumberOfSequences();
1692  double S1 = 0;
1693  double S2 = 0;
1694  for (size_t i = 0; i < nbseq - 1; i++)
1695  {
1696  for (size_t j = i + 1; j < nbseq; j++)
1697  {
1698  SequenceSelection ss(2);
1699  ss[0] = i;
1700  ss[1] = j;
1701  PolymorphismSequenceContainer* psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
1702  S1 += SequenceStatistics::watterson75(*psc2, true);
1703  S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true);
1704  delete psc2;
1705  }
1706  }
1707  double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1708  double H = SequenceStatistics::heterozygosity(*newpsc);
1709  double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1710  delete newpsc;
1711  return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.);
1712 }
1713 
1714 double SequenceStatistics::rightHandHudson_(double c, size_t n)
1715 {
1716  double nn = static_cast<double>(n);
1717  return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
1718 }
1719 
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
STL namespace.
unsigned int getSequenceCount(size_t index) const
Get the count of a sequence by index.
void addSequence(const Sequence &sequence, bool checkName=true)
The PolymorphismSequenceContainer class.