bpp-phyl  2.2.0
HierarchicalClustering.cpp
Go to the documentation of this file.
1 //
2 // File: HierarchicalClustering.cpp
3 // From file Cluster.cpp in CoMap package.
4 // Created by: Julien Dutheil
5 // Created on: Tue Aug 30 17:19 2005
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 16, 2004, 2005, 2006)
10 
11  This software is a computer program whose purpose is to map substitutions
12  on a tree and to detect co-evolving positions in a dataset.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "HierarchicalClustering.h"
42 #include "../NodeTemplate.h"
43 
44 using namespace bpp;
45 using namespace std;
46 
47 const string HierarchicalClustering::COMPLETE = "Complete";
48 const string HierarchicalClustering::SINGLE = "Single";
49 const string HierarchicalClustering::AVERAGE = "Average";
50 const string HierarchicalClustering::MEDIAN = "Median";
51 const string HierarchicalClustering::WARD = "Ward";
52 const string HierarchicalClustering::CENTROID = "Centroid";
53 
55 {
56  Node* root = TreeTemplateTools::cloneSubtree<Node>(*dynamic_cast<TreeTemplate<NodeTemplate<ClusterInfos> >*>(tree_)->getRootNode());
57  return new TreeTemplate<Node>(root);
58 }
59 
60 vector<size_t> HierarchicalClustering::getBestPair() throw (Exception)
61 {
62  vector<size_t> bestPair(2);
63  double distMin = -std::log(0.);
64  for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++)
65  {
66  size_t id = i->first;
67  map<size_t, Node*>::iterator j = i;
68  j++;
69  for ( ; j != currentNodes_.end(); j++)
70  {
71  size_t jd = j->first;
72  double dist = matrix_(id, jd);
73  if (dist < distMin)
74  {
75  distMin = dist;
76  bestPair[0] = id;
77  bestPair[1] = jd;
78  }
79  }
80  }
81  if (distMin == -std::log(0.))
82  {
83  cout << "---------------------------------------------------------------------------------" << endl;
84  for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++)
85  {
86  size_t id = i->first;
87  map<size_t, Node*>::iterator j = i;
88  j++;
89  for ( ; j != currentNodes_.end(); j++)
90  {
91  size_t jd = j->first;
92  double dist = matrix_(id, jd);
93  cout << dist << "\t";
94  }
95  cout << endl;
96  }
97  cout << "---------------------------------------------------------------------------------" << endl;
98 
99  throw Exception("Unexpected error: no minimum found in the distance matrix.");
100  }
101 
102  return bestPair;
103 }
104 vector<double> HierarchicalClustering::computeBranchLengthsForPair(const vector<size_t>& pair)
105 {
106  vector<double> d(2);
107  double dist = matrix_(pair[0], pair[1]) / 2.;
108  d[0] = dist - dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().length;
109  d[1] = dist - dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().length;
110  return d;
111 }
112 
113 double HierarchicalClustering::computeDistancesFromPair(const vector<size_t>& pair, const vector<double>& branchLengths, size_t pos)
114 {
115  double w1, w2, w3, w4;
116  if (method_ == "Single")
117  {
118  w1 = .5;
119  w2 = .5;
120  w3 = 0.;
121  w4 = -.5;
122  }
123  else if (method_ == "Complete")
124  {
125  w1 = .5;
126  w2 = .5;
127  w3 = 0.;
128  w4 = .5;
129  }
130  else if (method_ == "Median")
131  {
132  w1 = .5;
133  w2 = .5;
134  w3 = -0.25;
135  w4 = 0.;
136  }
137  else if (method_ == "Average")
138  {
139  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
140  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
141  w1 = n1 / (n1 + n2);
142  w2 = n2 / (n1 + n2);
143  w3 = 0.;
144  w4 = 0.;
145  }
146  else if (method_ == "Ward")
147  {
148  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
149  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
150  double n3 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pos])->getInfos().numberOfLeaves);
151  w1 = (n1 + n3) / (n1 + n2 + n3);
152  w2 = (n2 + n3) / (n1 + n2 + n3);
153  w3 = -n3 / (n1 + n2 + n3);
154  w4 = 0.;
155  }
156  else if (method_ == "Centroid")
157  {
158  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
159  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
160  w1 = n1 / (n1 + n2);
161  w2 = n2 / (n1 + n2);
162  w3 = -n1 * n2 / pow(n1 + n2, 2.);
163  w4 = 0.;
164  }
165  else
166  throw Exception("HierarchicalClustering::computeBranchLengthsForPair. unknown method '" + method_ + "'.");
167  double d1 = matrix_(pair[0], pos);
168  double d2 = matrix_(pair[1], pos);
169  double d3 = matrix_(pair[0], pair[1]);
170  return w1 * d1 + w2 * d2 + w3 * d3 + w4* std::abs(d1 - d2);
171 }
172 
174 {
176  map<size_t, Node*>::iterator it = currentNodes_.begin();
177  size_t i1 = it->first;
178  Node* n1 = it->second;
179  it++;
180  size_t i2 = it->first;
181  Node* n2 = it->second;
182  double d = matrix_(i1, i2) / 2;
183  root->addSon(n1);
184  root->addSon(n2);
185  n1->setDistanceToFather(d - dynamic_cast<NodeTemplate<ClusterInfos>*>(n1)->getInfos().length);
186  n2->setDistanceToFather(d - dynamic_cast<NodeTemplate<ClusterInfos>*>(n2)->getInfos().length);
187  tree_ = new TreeTemplate<NodeTemplate<ClusterInfos> >(root);
188 }
189 
190 Node* HierarchicalClustering::getLeafNode(int id, const string& name)
191 {
192  ClusterInfos infos;
193  infos.numberOfLeaves = 1;
194  infos.length = 0.;
196  leaf->setInfos(infos);
197  return leaf;
198 }
199 
201 {
202  ClusterInfos infos;
203  infos.numberOfLeaves =
204  dynamic_cast<NodeTemplate<ClusterInfos>*>(son1)->getInfos().numberOfLeaves
205  + dynamic_cast<NodeTemplate<ClusterInfos>*>(son2)->getInfos().numberOfLeaves;
206  infos.length = dynamic_cast<NodeTemplate<ClusterInfos>*>(son1)->getInfos().length + son1->getDistanceToFather();
207  Node* parent = new NodeTemplate<ClusterInfos>(id);
208  dynamic_cast<NodeTemplate<ClusterInfos>*>(parent)->setInfos(infos);
209  parent->addSon(son1);
210  parent->addSon(son2);
211  return parent;
212 }
213 
virtual void addSon(size_t pos, Node *node)
Definition: Node.h:407
virtual Node * getLeafNode(int id, const std::string &name)
Get a leaf node.
virtual void setInfos(const NodeInfos &infos)
Set the information to be associated to this node.
Definition: NodeTemplate.h:191
std::vector< double > computeBranchLengthsForPair(const std::vector< size_t > &pair)
Compute the branch lengths for two nodes to agglomerate.
static const std::string SINGLE
STL namespace.
The phylogenetic tree class.
static const std::string COMPLETE
double computeDistancesFromPair(const std::vector< size_t > &pair, const std::vector< double > &branchLengths, size_t pos)
Actualizes the distance matrix according to a given pair and the corresponding branch lengths...
virtual double getDistanceToFather() const
Get the distance to the father node is there is one, otherwise throw a NodeException.
Definition: Node.h:283
static const std::string CENTROID
std::vector< size_t > getBestPair()
Get the best pair of nodes to agglomerate.
The phylogenetic node class.
Definition: Node.h:90
TreeTemplate< Node > * getTree() const
Get the computed tree, if there is one.
virtual void setDistanceToFather(double distance)
Set or update the distance toward the father node.
Definition: Node.h:299
void finalStep(int idRoot)
Method called when there ar eonly three remaining node to agglomerate, and creates the root node of t...
static const std::string WARD
virtual Node * getParentNode(int id, Node *son1, Node *son2)
Get an inner node.
static const std::string MEDIAN
static const std::string AVERAGE
The NodeTemplate class.
Definition: NodeTemplate.h:73