bpp-seq-omics  2.2.0
MafStatistics.cpp
Go to the documentation of this file.
1 //
2 // File: MafStatistics.cpp
3 // Authors: Julien Dutheil
4 // Created: Mon Jun 25 2012
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "MafStatistics.h"
41 #include <Bpp/Seq/Container/SequenceContainerTools.h>
42 #include <Bpp/Seq/Container/VectorSiteContainer.h>
43 #include <Bpp/Seq/SiteTools.h>
44 
45 //From bpp-core:
46 #include <Bpp/Numeric/NumConstants.h>
47 
48 //From the STL:
49 #include <cmath>
50 #include <map>
51 
52 using namespace bpp;
53 using namespace std;
54 
56 {
57  vector<const MafSequence*> seqs1 = block.getSequencesForSpecies(species1_);
58  vector<const MafSequence*> seqs2 = block.getSequencesForSpecies(species2_);
59  if (seqs1.size() > 1 || seqs2.size() > 1)
60  throw Exception("PairwiseDivergenceMafStatistics::compute. Duplicated sequence for species " + species1_ + "or " + species2_ + ".");
61  if (seqs1.size() == 0 || seqs2.size() == 0)
62  result_.setValue(NumConstants::NaN());
63  else
64  result_.setValue(100. - SequenceTools::getPercentIdentity(*seqs1[0], *seqs2[0], true));
65 }
66 
68 {
69  vector<string> tags;
70  for (int i = 0; i < static_cast<int>(alphabet_->getSize()); ++i) {
71  tags.push_back(alphabet_->intToChar(i));
72  }
73  tags.push_back("Gap");
74  tags.push_back("Unresolved");
75 
76  return tags;
77 }
78 
80 {
81  std::map<int, int> counts;
82  SequenceContainerTools::getCounts(block.getAlignment(), counts);
83  for (int i = 0; i < static_cast<int>(alphabet_->getSize()); ++i) {
84  result_.setValue(alphabet_->intToChar(i), counts[i]);
85  }
86  result_.setValue("Gap", counts[alphabet_->getGapCharacterCode()]);
87  double countUnres = 0;
88  for (map<int, int>::iterator it = counts.begin(); it != counts.end(); ++it) {
89  if (alphabet_->isUnresolved(it->first))
90  countUnres += it->second;
91  }
92  result_.setValue("Unresolved", countUnres);
93 }
94 
96 {
97  VectorSiteContainer* alignment = new VectorSiteContainer(block.getAlignment().getAlphabet());
98  for (size_t i = 0; i < species_.size(); ++i) {
99  if (block.hasSequenceForSpecies(species_[i])) {
100  vector<const MafSequence*> selection = block.getSequencesForSpecies(species_[i]);
101  for (size_t j = 0; j < selection.size(); ++j) {
102  alignment->addSequence(*selection[j]);
103  }
104  }
105  }
106  return alignment;
107 }
108 
110  species_(species)
111 {
112  size_t n = VectorTools::vectorUnion(species).size();
113  size_t m = 0;
114  for (size_t i = 0; i < species.size(); ++i)
115  m += species_[i].size();
116  if (m != n)
117  throw Exception("AbstractSpeciesMultipleSelectionMafStatistics (constructor). Species selections must be fully distinct.");
118 }
119 
121 {
122  vector<SiteContainer*> alignments;
123  for (size_t k = 0; k < species_.size(); ++k) {
124  VectorSiteContainer* alignment = new VectorSiteContainer(block.getAlignment().getAlphabet());
125  for (size_t i = 0; i < species_[k].size(); ++i) {
126  if (block.hasSequenceForSpecies(species_[k][i])) {
127  vector<const MafSequence*> selection = block.getSequencesForSpecies(species_[k][i]);
128  for (size_t j = 0; j < selection.size(); ++j) {
129  alignment->addSequence(*selection[j]);
130  }
131  }
132  }
133  alignments.push_back(alignment);
134  }
135  return alignments;
136 }
137 
139 {
140  vector<string> tags;
141  for (size_t i = 0; i < categorizer_.getNumberOfCategories(); ++i) {
142  tags.push_back("Bin" + TextTools::toString(i + 1));
143  }
144  tags.push_back("Unresolved");
145  tags.push_back("Saturated");
146  tags.push_back("Ignored");
147  return tags;
148 }
149 
151 {
152  unsigned int nbUnresolved = 0;
153  unsigned int nbSaturated = 0;
154  unsigned int nbIgnored = 0;
156  int state;
157  bool hasOutgroup = (outgroup_ != "");
158  bool isAnalyzable;
159  auto_ptr<SiteContainer> alignment;
160  const Sequence* outgroupSeq = 0;
161  if (hasOutgroup) {
162  isAnalyzable = (block.hasSequenceForSpecies(outgroup_) && block.getNumberOfSequences() > 1);
163  if (isAnalyzable) {
164  //We need to extract the outgroup sequence:
165  outgroupSeq = &block.getSequenceForSpecies(outgroup_); //Here we assume there is only one! Otherwise we take the first one...
166  alignment.reset(getSiteContainer_(block));
167  }
168  } else {
169  isAnalyzable = (block.getNumberOfSequences() > 0);
170  if (isAnalyzable)
171  alignment.reset(getSiteContainer_(block));
172  }
173  if (isAnalyzable) {
174  for (size_t i = 0; i < alignment->getNumberOfSites(); ++i) {
175  //Note: we do not rely on SiteTool::getCounts as it would be unefficient to count everything.
176  const Site& site = alignment->getSite(i);
177  map<int, unsigned int> counts;
178  bool isUnresolved = false;
179  bool isSaturated = false;
180  for (size_t j = 0; !isUnresolved && !isSaturated && j < site.size(); ++j) {
181  state = site[j];
182  if (alphabet_->isGap(state) || alphabet_->isUnresolved(state)) {
183  isUnresolved = true;
184  } else {
185  counts[state]++;
186  if (counts.size() > 2) {
187  isSaturated = true;
188  }
189  }
190  }
191  if (isUnresolved) {
192  nbUnresolved++;
193  } else if (isSaturated) {
194  nbSaturated++;
195  } else if (hasOutgroup && (
196  alignment->getAlphabet()->isGap((*outgroupSeq)[i]) ||
197  alignment->getAlphabet()->isUnresolved((*outgroupSeq)[i]))) {
198  nbUnresolved++;
199  } else {
200  //Determine frequency class:
201  double count;
202  if (counts.size() == 1) {
203  if (hasOutgroup) {
204  if (counts.begin()->first == (*outgroupSeq)[i])
205  count = 0; //This is the ancestral state.
206  else
207  count = counts.begin()->second; //This is a derived state.
208  } else {
209  count = 0; //In this case we do not know, so we put 0.
210  }
211  } else {
212  map<int, unsigned int>::iterator it = counts.begin();
213  unsigned int count1 = it->second;
214  it++;
215  unsigned int count2 = it->second;
216  if (hasOutgroup) {
217  if (counts.begin()->first == (*outgroupSeq)[i])
218  count = counts.rbegin()->second; //This is the ancestral state, therefore we other one is the derived state.
219  else if (counts.rbegin()->first == (*outgroupSeq)[i])
220  count = counts.begin()->second; //The second state is the ancestral one, therefore the first one is the derived state.
221  else {
222  //None of the two states are ancestral! The position is therefore discarded.
223  isSaturated = true;
224  }
225  } else {
226  count = min(count1, count2); //In this case we do not know, so we take the minimum of the two values.
227  }
228  }
229  if (isSaturated)
230  nbSaturated++;
231  else {
232  try {
233  counts_[categorizer_.getCategory(count) - 1]++;
234  } catch (OutOfRangeException& oof) {
235  nbIgnored++;
236  }
237  }
238  }
239  }
240  }
241  result_.setValue("Unresolved", nbUnresolved);
242  result_.setValue("Saturated", nbSaturated);
243  result_.setValue("Ignored", nbIgnored);
244  for (size_t i = 0; i < counts_.size(); ++i) {
245  result_.setValue("Bin" + TextTools::toString(i + 1), counts_[i]);
246  }
247 }
248 
250 {
251  vector<string> tags;
252  tags.push_back("f1100");
253  tags.push_back("f0110");
254  tags.push_back("f1010");
255  tags.push_back("Ignored");
256  return tags;
257 }
258 
260 {
261  counts_.assign(6, 0);
262  auto_ptr<SiteContainer> alignment(getSiteContainer_(block));
263  if (alignment->getNumberOfSequences() == 4) {
264  unsigned int nbIgnored = 0;
265  for (size_t i = 0; i < block.getNumberOfSites(); ++i) {
266  const Site& site = alignment->getSite(i);
267  if (SiteTools::isComplete(site)) {
268  if (site[0] == site[1] &&
269  site[2] != site[1] &&
270  site[3] == site[2])
271  counts_[0]++;
272  else if (site[1] == site[2] &&
273  site[1] != site[0] &&
274  site[3] == site[0])
275  counts_[1]++;
276  else if (site[0] == site[2] &&
277  site[1] != site[0] &&
278  site[3] == site[1])
279  counts_[2]++;
280  } else {
281  nbIgnored++;
282  }
283  }
284  result_.setValue("f1100", counts_[0]);
285  result_.setValue("f0110", counts_[1]);
286  result_.setValue("f1010", counts_[2]);
287  result_.setValue("Ignored", nbIgnored);
288  } else {
289  result_.setValue("f1100", 0);
290  result_.setValue("f0110", 0);
291  result_.setValue("f1010", 0);
292  result_.setValue("Ignored", static_cast<double>(block.getNumberOfSites()));
293  }
294 }
295 
297 {
298  vector<string> tags;
299  tags.push_back("NbWithoutGap");
300  tags.push_back("NbComplete");
301  tags.push_back("NbParsimonyInformative");
302  return tags;
303 }
304 
306 {
307  auto_ptr<SiteContainer> alignment(getSiteContainer_(block));
308  unsigned int nbNg = 0;
309  unsigned int nbCo = 0;
310  unsigned int nbPi = 0;
311  if (alignment->getNumberOfSequences() > 0) {
312  for (size_t i = 0; i < alignment->getNumberOfSites(); ++i) {
313  if (!SiteTools::hasGap(alignment->getSite(i)))
314  nbNg++;
315  if (SiteTools::isComplete(alignment->getSite(i)))
316  nbCo++;
317  if (SiteTools::isParsimonyInformativeSite(alignment->getSite(i)))
318  nbPi++;
319  }
320  }
321  result_.setValue("NbWithoutGap", nbNg);
322  result_.setValue("NbComplete", nbCo);
323  result_.setValue("NbParsimonyInformative", nbPi);
324 }
325 
327 {
328  vector<string> tags;
329  tags.push_back("F");
330  tags.push_back("P");
331  tags.push_back("FP");
332  tags.push_back("PF");
333  tags.push_back("FF");
334  tags.push_back("X");
335  tags.push_back("FX");
336  tags.push_back("PX");
337  tags.push_back("XF");
338  tags.push_back("XP");
339  return tags;
340 }
341 
342 vector<int> PolymorphismMafStatistics::getPatterns_(const SiteContainer& sites)
343 {
344  vector<int> patterns(sites.getNumberOfSites());
345  for (size_t i = 0; i < sites.getNumberOfSites(); ++i) {
346  const Site& site = sites.getSite(i);
347  int s = -1; //Unresolved
348  if (SiteTools::isComplete(site)) {
349  if (SiteTools::isConstant(site)) {
350  s = site[0]; //The fixed state
351  } else {
352  s = -10; //Polymorphic.
353  }
354  }
355  patterns[i] = s;
356  }
357  return patterns;
358 }
359 
361 {
362  vector<SiteContainer*> alignments(getSiteContainers_(block));
363  unsigned int nbF = 0;
364  unsigned int nbP = 0;
365  unsigned int nbFF = 0;
366  unsigned int nbFP = 0;
367  unsigned int nbPF = 0;
368  unsigned int nbX = 0;
369  unsigned int nbFX = 0;
370  unsigned int nbPX = 0;
371  unsigned int nbXF = 0;
372  unsigned int nbXP = 0;
373  //Get all patterns:
374  vector<int> patterns1(block.getNumberOfSites(), -1);
375  vector<int> patterns2(block.getNumberOfSites(), -1);
376  if (alignments[0]->getNumberOfSequences() > 0) {
377  patterns1 = getPatterns_(*alignments[0]);
378  }
379  if (alignments[1]->getNumberOfSequences() > 0) {
380  patterns2 = getPatterns_(*alignments[1]);
381  }
382  //Compare patterns:
383  for (size_t i = 0; i < block.getNumberOfSites(); ++i) {
384  int p1 = patterns1[i];
385  int p2 = patterns2[i];
386  switch (p1) {
387  case -1 :
388  switch (p2) {
389  case -1 :
390  nbX++;
391  break;
392  case -10 :
393  nbXP++;
394  break;
395  default :
396  nbXF++;
397  }
398  break;
399 
400  case -10 :
401  switch (p2) {
402  case -1 :
403  nbPX++;
404  break;
405  case -10 :
406  nbP++;
407  break;
408  default :
409  nbPF++;
410  }
411  break;
412 
413  default :
414  switch (p2) {
415  case -1 :
416  nbFX++;
417  break;
418  case -10 :
419  nbFP++;
420  break;
421  default :
422  if (p1 == p2)
423  nbF++;
424  else
425  nbFF++;
426  }
427  }
428  }
429 
430  //Set results:
431  result_.setValue("F", nbF);
432  result_.setValue("P", nbP);
433  result_.setValue("FF", nbFF);
434  result_.setValue("FP", nbFP);
435  result_.setValue("PF", nbPF);
436  result_.setValue("X", nbX);
437  result_.setValue("FX", nbFX);
438  result_.setValue("PX", nbPX);
439  result_.setValue("XF", nbXF);
440  result_.setValue("XP", nbXP);
441 }
442 
444 {
445  vector<string> tags;
446  tags.push_back("NbSeggregating");
447  tags.push_back("WattersonTheta");
448  return tags;
449 }
450 
452 {
453  auto_ptr<SiteContainer> alignment(getSiteContainer_(block));
454  unsigned int nbSeg = 0;
455  unsigned int nbTot = 0;
456  if (alignment->getNumberOfSequences() > 0) {
457  for (size_t i = 0; i < alignment->getNumberOfSites(); ++i) {
458  const Site& site = alignment->getSite(i);
459  if (SiteTools::isComplete(site)) {
460  nbTot++;
461  if (!SiteTools::isConstant(site))
462  nbSeg++;
463  }
464  }
465  }
466  double wt = 0;
467  if (nbSeg > 0) {
468  size_t n = alignment->getNumberOfSequences();
469  double hf = 0;
470  for (double i = 1; i < n; ++i)
471  hf += 1. / i;
472  wt = static_cast<double>(nbSeg) / (static_cast<double>(nbTot) * hf);
473  }
474  result_.setValue("NbSeggregating", nbSeg);
475  result_.setValue("WattersonTheta", wt);
476 }
477 
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
std::vector< std::string > getSupportedTags() const
void compute(const MafBlock &block)
void compute(const MafBlock &block)
std::vector< const MafSequence * > getSequencesForSpecies(const std::string &species) const
Definition: MafBlock.h:149
STL namespace.
std::vector< std::string > getSupportedTags() const
std::vector< unsigned int > counts_
size_t getNumberOfSites() const
Definition: MafBlock.h:113
std::vector< std::string > getSupportedTags() const
void compute(const MafBlock &block)
AlignedSequenceContainer & getAlignment()
Definition: MafBlock.h:108
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:55
std::vector< std::string > getSupportedTags() const
SiteContainer * getSiteContainer_(const MafBlock &block)
void compute(const MafBlock &block)
void compute(const MafBlock &block)
std::vector< std::vector< std::string > > species_
std::vector< SiteContainer * > getSiteContainers_(const MafBlock &block)
std::vector< unsigned int > counts_
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
static std::vector< int > getPatterns_(const SiteContainer &sites)
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
Definition: MafStatistics.h:95
AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector< std::string > > &species)
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
MafStatisticsResult result_
std::vector< std::string > getSupportedTags() const