bpp-phyl  2.2.0
RewardMappingTools.cpp
Go to the documentation of this file.
1 //
2 // File: RewardMappingTools.cpp
3 // Created by: Laurent Guéguen
4 // Created on: vendredi 29 mars 2013, à 15h 01
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 16, 2004, 2005, 2006)
9 
10  This software is a computer program whose purpose is to provide classes
11  for phylogenetic data analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "RewardMappingTools.h"
41 #include "../Likelihood/DRTreeLikelihoodTools.h"
42 #include "../Likelihood/MarginalAncestralStateReconstruction.h"
43 
44 #include <Bpp/Text/TextTools.h>
45 #include <Bpp/App/ApplicationTools.h>
46 #include <Bpp/Numeric/Matrix/MatrixTools.h>
47 #include <Bpp/Numeric/DataTable.h>
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <iomanip>
53 
54 using namespace std;
55 
56 /******************************************************************************/
57 
59  const DRTreeLikelihood& drtl,
60  const vector<int>& nodeIds,
61  Reward& reward,
62  bool verbose) throw (Exception)
63 {
64  // Preamble:
65  if (!drtl.isInitialized())
66  throw Exception("RewardMappingTools::computeRewardVectors(). Likelihood object is not initialized.");
67 
68  // A few variables we'll need:
69 
70  const TreeTemplate<Node> tree(drtl.getTree());
71  const SiteContainer* sequences = drtl.getData();
72  const DiscreteDistribution* rDist = drtl.getRateDistribution();
73 
74  size_t nbSites = sequences->getNumberOfSites();
75  size_t nbDistinctSites = drtl.getLikelihoodData()->getNumberOfDistinctSites();
76  size_t nbStates = sequences->getAlphabet()->getSize();
77  size_t nbClasses = rDist->getNumberOfCategories();
78  vector<const Node*> nodes = tree.getNodes();
79  const vector<size_t>* rootPatternLinks
80  = &drtl.getLikelihoodData()->getRootArrayPositions();
81  nodes.pop_back(); // Remove root node.
82  size_t nbNodes = nodes.size();
83 
84  // We create a new ProbabilisticRewardMapping object:
85  ProbabilisticRewardMapping* rewards = new ProbabilisticRewardMapping(tree, &reward, nbSites);
86 
87  // Store likelihood for each rate for each site:
88  VVVdouble lik;
89  drtl.computeLikelihoodAtNode(tree.getRootId(), lik);
90  Vdouble Lr(nbDistinctSites, 0);
91  Vdouble rcProbs = rDist->getProbabilities();
92  Vdouble rcRates = rDist->getCategories();
93  for (size_t i = 0; i < nbDistinctSites; i++)
94  {
95  VVdouble* lik_i = &lik[i];
96  for (size_t c = 0; c < nbClasses; c++)
97  {
98  Vdouble* lik_i_c = &(*lik_i)[c];
99  double rc = rDist->getProbability(c);
100  for (size_t s = 0; s < nbStates; s++)
101  {
102  Lr[i] += (*lik_i_c)[s] * rc;
103  }
104  }
105  }
106 
107  // Compute the reward for each class and each branch in the tree:
108  if (verbose)
109  ApplicationTools::displayTask("Compute joint node-pairs likelihood", true);
110 
111  for (size_t l = 0; l < nbNodes; ++l)
112  {
113  // For each node,
114  const Node* currentNode = nodes[l];
115  if (nodeIds.size() > 0 && !VectorTools::contains(nodeIds, currentNode->getId()))
116  continue;
117 
118  const Node* father = currentNode->getFather();
119 
120  double d = currentNode->getDistanceToFather();
121 
122  if (verbose)
123  ApplicationTools::displayGauge(l, nbNodes - 1);
124  Vdouble rewardsForCurrentNode(nbDistinctSites);
125 
126  // Now we've got to compute likelihoods in a smart manner... ;)
127  VVVdouble likelihoodsFatherConstantPart(nbDistinctSites);
128  for (size_t i = 0; i < nbDistinctSites; i++)
129  {
130  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
131  likelihoodsFatherConstantPart_i->resize(nbClasses);
132  for (size_t c = 0; c < nbClasses; c++)
133  {
134  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
135  likelihoodsFatherConstantPart_i_c->resize(nbStates);
136  double rc = rDist->getProbability(c);
137  for (size_t s = 0; s < nbStates; s++)
138  {
139  // (* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s);
140  // freq is already accounted in the array
141  (*likelihoodsFatherConstantPart_i_c)[s] = rc;
142  }
143  }
144  }
145 
146  // First, what will remain constant:
147  size_t nbSons = father->getNumberOfSons();
148  for (size_t n = 0; n < nbSons; n++)
149  {
150  const Node* currentSon = father->getSon(n);
151  if (currentSon->getId() != currentNode->getId())
152  {
153  const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
154 
155  // Now iterate over all site partitions:
156  auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentSon->getId()));
157  VVVdouble pxy;
158  bool first;
159  while (mit->hasNext())
160  {
162  auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
163  first = true;
164  while (sit->hasNext())
165  {
166  size_t i = sit->next();
167  // We retrieve the transition probabilities for this site partition:
168  if (first)
169  {
170  pxy = drtl.getTransitionProbabilitiesPerRateClass(currentSon->getId(), i);
171  first = false;
172  }
173  const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
174  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
175  for (size_t c = 0; c < nbClasses; c++)
176  {
177  const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
178  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
179  VVdouble* pxy_c = &pxy[c];
180  for (size_t x = 0; x < nbStates; x++)
181  {
182  Vdouble* pxy_c_x = &(*pxy_c)[x];
183  double likelihood = 0.;
184  for (size_t y = 0; y < nbStates; y++)
185  {
186  likelihood += (*pxy_c_x)[y] * (*likelihoodsFather_son_i_c)[y];
187  }
188  (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
189  }
190  }
191  }
192  }
193  }
194  }
195  if (father->hasFather())
196  {
197  const Node* currentSon = father->getFather();
198  const VVVdouble* likelihoodsFather_son = &drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentSon->getId());
199  // Now iterate over all site partitions:
200  auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(father->getId()));
201  VVVdouble pxy;
202  bool first;
203  while (mit->hasNext())
204  {
206  auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
207  first = true;
208  while (sit->hasNext())
209  {
210  size_t i = sit->next();
211  // We retrieve the transition probabilities for this site partition:
212  if (first)
213  {
214  pxy = drtl.getTransitionProbabilitiesPerRateClass(father->getId(), i);
215  first = false;
216  }
217  const VVdouble* likelihoodsFather_son_i = &(*likelihoodsFather_son)[i];
218  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
219  for (size_t c = 0; c < nbClasses; c++)
220  {
221  const Vdouble* likelihoodsFather_son_i_c = &(*likelihoodsFather_son_i)[c];
222  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
223  VVdouble* pxy_c = &pxy[c];
224  for (size_t x = 0; x < nbStates; x++)
225  {
226  double likelihood = 0.;
227  for (size_t y = 0; y < nbStates; y++)
228  {
229  Vdouble* pxy_c_x = &(*pxy_c)[y];
230  likelihood += (*pxy_c_x)[x] * (*likelihoodsFather_son_i_c)[y];
231  }
232  (*likelihoodsFatherConstantPart_i_c)[x] *= likelihood;
233  }
234  }
235  }
236  }
237  }
238  else
239  {
240  // Account for root frequencies:
241  for (size_t i = 0; i < nbDistinctSites; i++)
242  {
243  vector<double> freqs = drtl.getRootFrequencies(i);
244  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
245  for (size_t c = 0; c < nbClasses; c++)
246  {
247  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
248  for (size_t x = 0; x < nbStates; x++)
249  {
250  (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x];
251  }
252  }
253  }
254  }
255 
256  // Then, we deal with the node of interest.
257  // We first average upon 'y' to save computations, and then upon 'x'.
258  // ('y' is the state at 'node' and 'x' the state at 'father'.)
259 
260  // Iterate over all site partitions:
261  const VVVdouble* likelihoodsFather_node = &(drtl.getLikelihoodData()->getLikelihoodArray(father->getId(), currentNode->getId()));
262  auto_ptr<TreeLikelihood::ConstBranchModelIterator> mit(drtl.getNewBranchModelIterator(currentNode->getId()));
263  VVVdouble pxy;
264  bool first;
265  while (mit->hasNext())
266  {
268  reward.setSubstitutionModel(bmd->getModel());
269  // compute all nxy first:
270  VVVdouble nxy(nbClasses);
271  for (size_t c = 0; c < nbClasses; ++c)
272  {
273  VVdouble* nxy_c = &nxy[c];
274  double rc = rcRates[c];
275  Matrix<double>* nij = reward.getAllRewards(d * rc);
276  nxy_c->resize(nbStates);
277  for (size_t x = 0; x < nbStates; ++x)
278  {
279  Vdouble* nxy_c_x = &(*nxy_c)[x];
280  nxy_c_x->resize(nbStates);
281  for (size_t y = 0; y < nbStates; ++y)
282  {
283  (*nxy_c_x)[y] = (*nij)(x, y);
284  }
285  }
286  delete nij;
287  }
288 
289  // Now loop over sites:
290  auto_ptr<TreeLikelihood::SiteIterator> sit(bmd->getNewSiteIterator());
291  first = true;
292  while (sit->hasNext())
293  {
294  size_t i = sit->next();
295  // We retrieve the transition probabilities and substitution counts for this site partition:
296  if (first)
297  {
298  pxy = drtl.getTransitionProbabilitiesPerRateClass(currentNode->getId(), i);
299  first = false;
300  }
301  const VVdouble* likelihoodsFather_node_i = &(*likelihoodsFather_node)[i];
302  VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
303  for (size_t c = 0; c < nbClasses; ++c)
304  {
305  const Vdouble* likelihoodsFather_node_i_c = &(*likelihoodsFather_node_i)[c];
306  Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
307  const VVdouble* pxy_c = &pxy[c];
308  VVdouble* nxy_c = &nxy[c];
309  for (size_t x = 0; x < nbStates; ++x)
310  {
311  double* likelihoodsFatherConstantPart_i_c_x = &(*likelihoodsFatherConstantPart_i_c)[x];
312  const Vdouble* pxy_c_x = &(*pxy_c)[x];
313  for (size_t y = 0; y < nbStates; ++y)
314  {
315  double likelihood_cxy = (*likelihoodsFatherConstantPart_i_c_x)
316  * (*pxy_c_x)[y]
317  * (*likelihoodsFather_node_i_c)[y];
318 
319  // Now the vector computation:
320  rewardsForCurrentNode[i] += likelihood_cxy * (*nxy_c)[x][y];
321  // <------------> <--------------->
322  // Posterior probability | |
323  // for site i and rate class c * | |
324  // likelihood for this site------+ |
325  // |
326  // Reward function for site i and rate class c------+
327  }
328  }
329  }
330  }
331  }
332 
333  // Now we just have to copy the substitutions into the result vector:
334  for (size_t i = 0; i < nbSites; ++i)
335  {
336  (*rewards)(l, i) = rewardsForCurrentNode[(*rootPatternLinks)[i]] / Lr[(*rootPatternLinks)[i]];
337  }
338  }
339  if (verbose)
340  {
341  if (ApplicationTools::message)
342  *ApplicationTools::message << " ";
343  ApplicationTools::displayTaskDone();
344  }
345  return rewards;
346 }
347 
348 /**************************************************************************************************/
349 
351  const ProbabilisticRewardMapping& rewards,
352  const SiteContainer& sites,
353  ostream& out)
354 throw (IOException)
355 {
356  if (!out)
357  throw IOException("RewardMappingTools::writeToFile. Can't write to stream.");
358  out << "Branches";
359  out << "\tMean";
360  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
361  {
362  out << "\tSite" << sites.getSite(i).getPosition();
363  }
364  out << endl;
365 
366  for (size_t j = 0; j < rewards.getNumberOfBranches(); j++)
367  {
368  out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather();
369  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
370  {
371  out << "\t" << rewards(j, i);
372  }
373  out << endl;
374  }
375 }
376 
377 /**************************************************************************************************/
378 
380 throw (IOException)
381 {
382  try
383  {
384  DataTable* data = DataTable::read(in, "\t", true, -1);
385  vector<string> ids = data->getColumn(0);
386  data->deleteColumn(0); // Remove ids
387  data->deleteColumn(0); // Remove means
388  // Now parse the table:
389  size_t nbSites = data->getNumberOfColumns();
390  rewards.setNumberOfSites(nbSites);
391  size_t nbBranches = data->getNumberOfRows();
392  for (size_t i = 0; i < nbBranches; i++)
393  {
394  int id = TextTools::toInt(ids[i]);
395  size_t br = rewards.getNodeIndex(id);
396  for (size_t j = 0; j < nbSites; j++)
397  {
398  rewards(br, j) = TextTools::toDouble((*data)(i, j));
399  }
400  }
401  // Parse the header:
402  for (size_t i = 0; i < nbSites; i++)
403  {
404  string siteTxt = data->getColumnName(i);
405  int site = 0;
406  if (siteTxt.substr(0, 4) == "Site")
407  site = TextTools::to<int>(siteTxt.substr(4));
408  else
409  site = TextTools::to<int>(siteTxt);
410  rewards.setSitePosition(i, site);
411  }
412 
413  delete data;
414  }
415  catch (Exception& e)
416  {
417  throw IOException(string("Bad input file. ") + e.what());
418  }
419 }
420 
421 /**************************************************************************************************/
422 
423 double RewardMappingTools::computeSumForBranch(const RewardMapping& smap, size_t branchIndex)
424 {
425  size_t nbSites = smap.getNumberOfSites();
426  double v = 0;
427  for (size_t i = 0; i < nbSites; ++i)
428  {
429  v += smap(branchIndex, i);
430  }
431  return v;
432 }
433 
434 /**************************************************************************************************/
435 
436 double RewardMappingTools::computeSumForSite(const RewardMapping& smap, size_t siteIndex)
437 {
438  size_t nbBranches = smap.getNumberOfBranches();
439  double v = 0;
440  for (size_t i = 0; i < nbBranches; ++i)
441  {
442  v += smap(i, siteIndex);
443  }
444  return v;
445 }
446 
447 /**************************************************************************************************/
A pair of SubstitutionModel / SiteIterator.
virtual const SubstitutionModel * getModel() const =0
virtual SiteIterator * getNewSiteIterator() const =0
virtual const Node * getSon(size_t pos) const
Definition: Node.h:395
static double computeSumForSite(const RewardMapping &smap, size_t siteIndex)
Sum all substitutions for each type of a given site (specified by its index).
static void readFromStream(std::istream &in, ProbabilisticRewardMapping &rewards)
Read the reward vectors from a stream.
STL namespace.
The phylogenetic tree class.
General interface for storing reward mapping data.
Definition: RewardMapping.h:62
static ProbabilisticRewardMapping * computeRewardVectors(const DRTreeLikelihood &drtl, const std::vector< int > &nodeIds, Reward &reward, bool verbose=true)
Compute the reward vectors for a particular dataset using the double-recursive likelihood computation...
virtual const Node * getFather() const
Get the father of this node is there is one.
Definition: Node.h:339
The Reward interface.
Definition: Reward.h:70
static double computeSumForBranch(const RewardMapping &smap, size_t branchIndex)
Sum all rewards of a given branch (specified by its index).
virtual size_t getNumberOfBranches() const =0
virtual bool hasFather() const
Tell if this node has a father node.
Definition: Node.h:379
virtual double getDistanceToFather() const
Get the distance to the father node is there is one, otherwise throw a NodeException.
Definition: Node.h:283
virtual int getId() const
Get the node&#39;s id.
Definition: Node.h:203
virtual ConstBranchModelDescription * next()=0
The phylogenetic node class.
Definition: Node.h:90
Data storage class for probabilistic rewards mappings.
virtual size_t getNumberOfSons() const
Definition: Node.h:388
Interface for double-recursive (DR) implementation of the likelihood computation. ...
virtual size_t getNumberOfSites() const =0
static void writeToStream(const ProbabilisticRewardMapping &rewards, const SiteContainer &sites, std::ostream &out)
Write the reward vectors to a stream.
virtual std::vector< const N * > getNodes() const
Definition: TreeTemplate.h:411