bpp-phyl  2.2.0
CoalaCore.cpp
Go to the documentation of this file.
1 //
2 // File: CoalaCore.cpp
3 // Created by: Mathieu Groussin
4 // Created on: Sun Mar 13 12:00:00 2011
5 //
6 
7 /*
8  Copyright or © or Copr. CNRS, (November 16, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for phylogenetic data analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 
41 #include "CoalaCore.h"
42 
43 // #include <Bpp/Io/FileTools.h>
44 #include <Bpp/Text/TextTools.h>
45 #include <Bpp/Text/StringTokenizer.h>
46 #include <Bpp/App/ApplicationTools.h>
47 #include <Bpp/Numeric/VectorTools.h>
48 #include <Bpp/Numeric/Matrix/MatrixTools.h>
49 #include <Bpp/Numeric/Stat/Mva/CorrespondenceAnalysis.h>
50 
51 #include <Bpp/Seq/SequenceTools.h>
52 
53 
54 using namespace bpp;
55 
56 // From the STL:
57 #include <iostream>
58 #include <fstream>
59 #include <string>
60 
61 using namespace std;
62 
63 /******************************************************************************/
64 
65 CoalaCore::CoalaCore(size_t nbAxes, const string& exch) :
66  init_(true),
67  nbrOfAxes_(nbAxes),
68  exch_(exch),
69  P_(),
70  R_(),
71  colWeights_(),
72  paramValues_()
73 {}
74 
75 /******************************************************************************/
76 
77 ParameterList CoalaCore::computeCOA(const SequenceContainer& data, bool param)
78 {
79  ParameterList pList;
80  // Now we perform the Correspondence Analysis on from the matrix of observed frequencies computed on the alignment, to obtain the matrix of principal axes.
81  // First, the matrix of amino acid frequencies is calculated from the alignment:
82  vector<string> names = data.getSequencesNames();
83  vector< map<int, double> > freqs(names.size()); // One map per sequence
84  // Each map is filled with the corresponding frequencies, which are then normalized.
85  for (size_t i = 0; i < names.size(); ++i)
86  {
87  Sequence* seq = new BasicSequence(data.getSequence(names[i]));
88  SymbolListTools::changeGapsToUnknownCharacters(*seq);
89  SequenceTools::getFrequencies(*seq, freqs.at(i));
90  // Unknown characters are now ignored:
91  double t = 0;
92  for (int k = 0; k < 20; ++k)
93  {
94  t += freqs.at(i)[k];
95  }
96  for (int k = 0; k < 20; k++)
97  {
98  freqs.at(i)[k] /= t;
99  }
100  delete seq;
101  }
102 
103  // The matrix of observed frequencies is filled. If an amino acid is completely absent from the alignment, its frequency is set to 10^-6.
104  RowMatrix<double> freqMatrix(names.size(), 20);
105  for (size_t i = 0; i < freqs.size(); i++)
106  {
107  bool normalize = false;
108  for (size_t j = 0; j < 20; j++)
109  {
110  map<int, double>::iterator it = freqs[i].find(static_cast<int>(j));
111  if (it != freqs[i].end())
112  {
113  freqMatrix(i, j) = (*it).second;
114  }
115  else
116  {
117  freqMatrix(i, j) = 0.000001;
118  normalize = true;
119  }
120  }
121  if (normalize)
122  {
123  double sum = 0;
124  for (size_t k = 0; k < 20; k++)
125  {
126  sum += freqMatrix(i, k);
127  }
128  for (size_t l = 0; l < 20; l++)
129  {
130  freqMatrix(i, l) = freqMatrix(i, l) / sum;
131  }
132  }
133  }
134 
135  // The COA analysis:
136  CorrespondenceAnalysis* coa = new CorrespondenceAnalysis(freqMatrix, 19);
137  // Matrix of principal axes:
138  RowMatrix<double> ppalAxes = coa->getPrincipalAxes();
139  // The transpose of the matrix of principal axes is computed:
140  MatrixTools::transpose(ppalAxes, P_);
141  // The matrix of row coordinates is stored:
142  R_ = coa->getRowCoordinates();
143  // The column weights are retrieved:
144  colWeights_ = coa->getColumnWeights();
145 
146  if (param)
147  {
148  // Parameters are defined:
149  size_t nbAxesConserved = coa->getNbOfKeptAxes();
150  if (nbrOfAxes_ > nbAxesConserved)
151  {
152  ApplicationTools::displayWarning("The specified number of parameters per branch (" + TextTools::toString(nbrOfAxes_) +
153  ") is higher than the number of axes (" + TextTools::toString(nbAxesConserved) +
154  ")... The number of parameters per branch is now equal to the number of axes kept by the COA analysis (" + TextTools::toString(nbAxesConserved) + ")");
155  nbrOfAxes_ = nbAxesConserved;
156  }
157  for (unsigned int i = 0; i < nbrOfAxes_; i++)
158  {
159  const vector<double> rCoords = R_.col(i);
160  double maxCoord = VectorTools::max(rCoords);
161  double minCoord = VectorTools::min(rCoords);
162  double sd = VectorTools::sd<double, double>(rCoords);
163  IntervalConstraint* constraint = new IntervalConstraint(minCoord - sd, maxCoord + sd, true, true);
164  if (paramValues_.find("AxPos" + TextTools::toString(i)) != paramValues_.end())
165  pList.addParameter(Parameter("Coala.AxPos" + TextTools::toString(i), TextTools::toDouble(paramValues_["AxPos" + TextTools::toString(i)].substr(0, 8)), constraint));
166  else
167  pList.addParameter(Parameter("Coala.AxPos" + TextTools::toString(i), 0., constraint));
168  }
169  }
170  return pList;
171 }
172 
173 /******************************************************************************/
174 /* Function that computes the product of a matrix P of size nbrOfAxes_x20 with a vector V of size nbrOfAxes_, and returns a vector of size 20.*/
175 
176 vector<double> CoalaCore::prodMatrixVector(RowMatrix<double>& P, vector<double>& V)
177 {
178  vector<double> E(20, 0.0);
179 
180  for (unsigned int i = 0; i < 20; i++)
181  {
182  for (unsigned int j = 0; j < V.size(); j++)
183  {
184  E[i] = E[i] + P(j, i) * V[j];
185  }
186  }
187  return E;
188 }
189 
190 /******************************************************************************/
RowMatrix< double > P_
Definition: CoalaCore.h:72
STL namespace.
std::vector< double > colWeights_
Definition: CoalaCore.h:74
RowMatrix< double > R_
Definition: CoalaCore.h:73
std::vector< double > prodMatrixVector(RowMatrix< double > &P, std::vector< double > &V)
Definition: CoalaCore.cpp:176
CoalaCore(size_t nbAxes=0, const std::string &exch="LG08")
Definition: CoalaCore.cpp:65
ParameterList computeCOA(const SequenceContainer &data, bool param=true)
Definition: CoalaCore.cpp:77
size_t nbrOfAxes_
Definition: CoalaCore.h:70
std::map< std::string, std::string > paramValues_
Definition: CoalaCore.h:75