bpp-seq  2.2.0
SiteTools.cpp
Go to the documentation of this file.
1 //
2 // File SiteTools.cpp
3 // Author : Julien Dutheil
4 // Guillaume Deuchst
5 // Created on: Friday August 8 2003
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "SiteTools.h"
42 #include "Alphabet/CodonAlphabet.h"
43 #include <Bpp/Utils/MapTools.h>
44 #include <Bpp/Numeric/NumTools.h>
45 #include <Bpp/Numeric/VectorTools.h>
46 
47 using namespace bpp;
48 
49 // From the STL:
50 #include <cmath>
51 
52 using namespace std;
53 
54 /******************************************************************************/
55 
56 bool SiteTools::hasGap(const Site& site)
57 {
58  // Main loop : for all characters in site
59  for (size_t i = 0; i < site.size(); i++)
60  {
61  if (site.getAlphabet()->isGap(site[i]))
62  return true;
63  }
64  return false;
65 }
66 
67 /******************************************************************************/
68 
69 bool SiteTools::isGapOnly(const Site& site)
70 {
71  // Main loop : for all characters in site
72  for (size_t i = 0; i < site.size(); i++)
73  {
74  if (!site.getAlphabet()->isGap(site[i]))
75  return false;
76  }
77  return true;
78 }
79 
80 /******************************************************************************/
81 
83 {
84  // Main loop : for all characters in site
85  for (size_t i = 0; i < site.size(); i++)
86  {
87  if (!site.getAlphabet()->isGap(site[i]) && !site.getAlphabet()->isUnresolved(site[i]))
88  return false;
89  }
90  return true;
91 }
92 
93 /******************************************************************************/
94 
95 bool SiteTools::hasUnknown(const Site& site)
96 {
97  // Main loop : for all characters in site
98  for (size_t i = 0; i < site.size(); i++)
99  {
100  if (site[i] == site.getAlphabet()->getUnknownCharacterCode())
101  return true;
102  }
103  return false;
104 }
105 
106 /******************************************************************************/
107 
108 bool SiteTools::isComplete(const Site& site)
109 {
110  // Main loop : for all characters in site
111  for (size_t i = 0; i < site.size(); i++)
112  {
113  if (site.getAlphabet()->isGap(site[i]) || site.getAlphabet()->isUnresolved(site[i]))
114  return false;
115  }
116  return true;
117 }
118 
119 /******************************************************************************/
120 
121 bool SiteTools::areSitesIdentical(const Site& site1, const Site& site2)
122 {
123  // Site's size and content checking
124  if (site1.getAlphabet()->getAlphabetType() != site2.getAlphabet()->getAlphabetType())
125  return false;
126  if (site1.size() != site2.size())
127  return false;
128  else
129  {
130  for (size_t i = 0; i < site1.size(); i++)
131  {
132  if (site1[i] != site2[i])
133  return false;
134  }
135  return true;
136  }
137 }
138 
139 /******************************************************************************/
140 
141 bool SiteTools::isConstant(const Site& site, bool ignoreUnknown, bool unresolvedRaisesException) throw (EmptySiteException)
142 {
143  // Empty site checking
144  if (site.size() == 0)
145  throw EmptySiteException("SiteTools::isConstant: Incorrect specified site, size must be > 0", &site);
146 
147  // For all site's characters
148  int gap = site.getAlphabet()->getGapCharacterCode();
149  if (ignoreUnknown)
150  {
151  int s = site[0];
152  int unknown = site.getAlphabet()->getUnknownCharacterCode();
153  size_t i = 0;
154  while (i < site.size() && (s == gap || s == unknown))
155  {
156  s = site[i];
157  i++;
158  }
159  if (s == unknown || s == gap)
160  {
161  if (unresolvedRaisesException)
162  throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps or generic characters.");
163  else
164  return false;
165  }
166  while (i < site.size())
167  {
168  if (site[i] != s && site[i] != gap && site[i] != unknown)
169  return false;
170  i++;
171  }
172  }
173  else
174  {
175  int s = site[0];
176  size_t i = 0;
177  while (i < site.size() && s == gap)
178  {
179  s = site[i];
180  i++;
181  }
182  if (s == gap)
183  {
184  if (unresolvedRaisesException)
185  throw EmptySiteException("SiteTools::isConstant: Site is only made of gaps.");
186  else
187  return false;
188  }
189  while (i < site.size())
190  {
191  if (site[i] != s && site[i] != gap)
192  return false;
193  i++;
194  }
195  }
196 
197  return true;
198 }
199 
200 /******************************************************************************/
201 
202 double SiteTools::variabilityShannon(const Site& site, bool resolveUnknown) throw (EmptySiteException)
203 {
204  // Empty site checking
205  if (site.size() == 0)
206  throw EmptySiteException("SiteTools::variabilityShannon: Incorrect specified site, size must be > 0", &site);
207  map<int, double> p;
208  getFrequencies(site, p, resolveUnknown);
209  // We need to correct frequencies for gaps:
210  double s = 0.;
211  for (int i = 0; i < static_cast<int>(site.getAlphabet()->getSize()); i++)
212  {
213  double f = p[i];
214  if (f > 0)
215  s += f * log(f);
216  }
217  return -s;
218 }
219 
220 /******************************************************************************/
221 
222 double SiteTools::mutualInformation(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
223 {
224  // Empty site checking
225  if (site1.size() == 0)
226  throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site1);
227  if (site2.size() == 0)
228  throw EmptySiteException("SiteTools::mutualInformation: Incorrect specified site, size must be > 0", &site2);
229  if (site1.size() != site2.size())
230  throw DimensionException("SiteTools::mutualInformation: sites must have the same size!", site1.size(), site2.size());
231  vector<double> p1(site1.getAlphabet()->getSize());
232  vector<double> p2(site2.getAlphabet()->getSize());
233  map<int, map<int, double> > p12;
234  getCounts(site1, site2, p12, resolveUnknown);
235  double mi = 0, tot = 0, pxy;
236  // We need to correct frequencies for gaps:
237  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
238  {
239  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
240  {
241  pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
242  tot += pxy;
243  p1[i] += pxy;
244  p2[j] += pxy;
245  }
246  }
247  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
248  {
249  p1[i] /= tot;
250  }
251  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
252  {
253  p2[j] /= tot;
254  }
255  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
256  {
257  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
258  {
259  pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
260  if (pxy > 0)
261  mi += pxy * log(pxy / (p1[i] * p2[j]));
262  }
263  }
264  return mi;
265 }
266 
267 /******************************************************************************/
268 
269 double SiteTools::jointEntropy(const Site& site1, const Site& site2, bool resolveUnknown) throw (DimensionException, EmptySiteException)
270 {
271  // Empty site checking
272  if (site1.size() == 0)
273  throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site1);
274  if (site2.size() == 0)
275  throw EmptySiteException("SiteTools::jointEntropy: Incorrect specified site, size must be > 0", &site2);
276  if (site1.size() != site2.size())
277  throw DimensionException("SiteTools::jointEntropy: sites must have the same size!", site1.size(), site2.size());
278  map<int, map<int, double> > p12;
279  getCounts(site1, site2, p12, resolveUnknown);
280  double tot = 0, pxy, h = 0;
281  // We need to correct frequencies for gaps:
282  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
283  {
284  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
285  {
286  pxy = p12[static_cast<int>(i)][static_cast<int>(j)];
287  tot += pxy;
288  }
289  }
290  for (size_t i = 0; i < site1.getAlphabet()->getSize(); i++)
291  {
292  for (size_t j = 0; j < site2.getAlphabet()->getSize(); j++)
293  {
294  pxy = p12[static_cast<int>(i)][static_cast<int>(j)] / tot;
295  if (pxy > 0)
296  h += pxy * log(pxy);
297  }
298  }
299  return -h;
300 }
301 
302 /******************************************************************************/
303 
305 {
306  // Empty site checking
307  if (site.size() == 0)
308  throw EmptySiteException("SiteTools::variabilityFactorial: Incorrect specified site, size must be > 0", &site);
309  map<int, size_t> p;
310  getCounts(site, p);
311  vector<size_t> c = MapTools::getValues(p);
312  size_t s = VectorTools::sum(c);
313  long double l = static_cast<long double>(NumTools::fact(s)) / static_cast<long double>(VectorTools::sum(VectorTools::fact(c)));
314  return (static_cast<double>(std::log(l)));
315 }
316 
317 /******************************************************************************/
318 
320 {
321  // Empty site checking
322  if (site.size() == 0)
323  throw EmptySiteException("SiteTools::heterozygosity: Incorrect specified site, size must be > 0", &site);
324  map<int, double> p;
325  getFrequencies(site, p);
326  vector<double> c = MapTools::getValues(p);
327  double n = VectorTools::norm<double, double>(MapTools::getValues(p));
328  return 1. - n * n;
329 }
330 
331 /******************************************************************************/
332 
334 {
335  // Empty site checking
336  if (site.size() == 0)
337  throw EmptySiteException("SiteTools::getNumberOfDistinctCharacters(): Incorrect specified site, size must be > 0", &site);
338  // For all site's characters
339  if (SiteTools::isConstant(site))
340  return 1;
341  map<int, size_t> counts;
342  SymbolListTools::getCounts(site, counts);
343  size_t s = 0;
344  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
345  {
346  if (it->second != 0)
347  s++;
348  }
349  return s;
350 }
351 
352 /******************************************************************************/
353 
355 {
356  // Empty site checking
357  if (site.size() == 0)
358  throw EmptySiteException("SiteTools::hasSingleton: Incorrect specified site, size must be > 0", &site);
359  // For all site's characters
360  if (SiteTools::isConstant(site))
361  return false;
362  map<int, size_t> counts;
363  getCounts(site, counts);
364  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
365  {
366  if (it->second == 1)
367  return true;
368  }
369  return false;
370 }
371 
372 /******************************************************************************/
373 
375 {
376  // Empty site checking
377  if (site.size() == 0)
378  throw EmptySiteException("SiteTools::isParsimonyInformativeSite: Incorrect specified site, size must be > 0", &site);
379  // For all site's characters
380  if (SiteTools::isConstant(site, false, false))
381  return false;
382  map<int, size_t> counts;
383  SymbolListTools::getCounts(site, counts);
384  size_t npars = 0;
385  for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
386  {
387  if (it->second > 1)
388  npars++;
389  }
390  if (npars > 1)
391  return true;
392  return false;
393 }
394 
395 /******************************************************************************/
396 
398 {
399  // Empty site checking
400  if (site.size() == 0)
401  throw EmptySiteException("SiteTools::isTriplet: Incorrect specified site, size must be > 0", &site);
402  // For all site's characters
404  return true;
405  else
406  return false;
407 }
408 
409 /******************************************************************************/
410 
static void getCounts(const SymbolList &list, std::map< int, size_t > &counts)
Count all states in the list.
static double mutualInformation(const Site &site1, const Site &site2, bool resolveUnknowns)
Compute the mutual information between two sites.
Definition: SiteTools.cpp:222
static bool isConstant(const Site &site, bool ignoreUnknown=false, bool unresolvedRaisesException=true)
Tell if a site is constant, that is displaying the same state in all sequences that do not present a ...
Definition: SiteTools.cpp:141
static bool isParsimonyInformativeSite(const Site &site)
Tell if a site is a parsimony informative site.
Definition: SiteTools.cpp:374
virtual bool isGap(int state) const =0
This alphabet is used to deal NumericAlphabet.
virtual bool isUnresolved(int state) const =0
STL namespace.
static bool hasUnknown(const Site &site)
Definition: SiteTools.cpp:95
static bool isGapOrUnresolvedOnly(const Site &site)
Definition: SiteTools.cpp:82
static bool isComplete(const Site &site)
Definition: SiteTools.cpp:108
static bool hasGap(const Site &site)
Definition: SiteTools.cpp:56
static double heterozygosity(const Site &site)
Compute the heterozygosity index of a site.
Definition: SiteTools.cpp:319
Exception sent when a empty site is found.
static size_t getNumberOfDistinctCharacters(const Site &site)
Give the number of distinct characters at a site.
Definition: SiteTools.cpp:333
virtual size_t size() const
Get the number of elements in the list.
Definition: SymbolList.h:350
static bool areSitesIdentical(const Site &site1, const Site &site2)
Definition: SiteTools.cpp:121
static bool hasSingleton(const Site &site)
Tell if a site has singletons.
Definition: SiteTools.cpp:354
static bool isGapOnly(const Site &site)
Definition: SiteTools.cpp:69
static bool isTriplet(const Site &site)
Tell if a site has more than 2 distinct characters.
Definition: SiteTools.cpp:397
static double variabilityShannon(const Site &site, bool resolveUnknowns)
Compute the Shannon entropy index of a site.
Definition: SiteTools.cpp:202
virtual const Alphabet * getAlphabet() const
Get the alphabet associated to the list.
Definition: SymbolList.h:348
The Site class.
Definition: Site.h:61
static double variabilityFactorial(const Site &site)
Compute the factorial diversity index of a site.
Definition: SiteTools.cpp:304
static double jointEntropy(const Site &site1, const Site &site2, bool resolveUnknowns)
Compute the joint entropy between two sites.
Definition: SiteTools.cpp:269
virtual int getUnknownCharacterCode() const =0
virtual std::string getAlphabetType() const =0
Identification method.