bpp-seq  2.2.0
SymbolListTools.cpp
Go to the documentation of this file.
1 //
2 // File: SymbolListTools.cpp
3 // Created by: Julien Dutheil
4 // Created on: Wed Apr 9 2004
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "SymbolListTools.h"
41 #include "Alphabet/AlphabetTools.h"
42 #include <Bpp/Numeric/Random/RandomTools.h>
43 
44 //From the STL:
45 #include <algorithm>
46 
47 using namespace std;
48 
49 using namespace bpp;
50 
51 void SymbolListTools::getCounts(const SymbolList& list, map<int, double>& counts, bool resolveUnknowns)
52 {
53  if (!resolveUnknowns)
54  {
55  for (vector<int>::const_iterator seqit = list.getContent().begin();
56  seqit != list.getContent().end();
57  seqit++)
58  counts[*seqit]++;
59  }
60  else
61  {
62  for (vector<int>::const_iterator seqit = list.getContent().begin();
63  seqit != list.getContent().end();
64  seqit++)
65  {
66  vector<int> alias = list.getAlphabet()->getAlias(*seqit);
67  double n = (double)alias.size();
68  for (size_t j = 0; j < alias.size(); j++) counts[alias[j]] += 1./n ;
69  }
70  }
71 }
72 
73 void SymbolListTools::getCounts(const SymbolList& list1, const SymbolList& list2, map< int, map<int, double> >& counts, bool resolveUnknowns) throw (DimensionException)
74 {
75  if (list1.size() != list2.size()) throw DimensionException("SymbolListTools::getCounts: the two sites must have the same size.", list1.size(), list2.size());
76  if (!resolveUnknowns)
77  {
78  for (size_t i = 0; i < list1.size(); i++)
79  counts[list1[i]][list2[i]]++;
80  }
81  else
82  {
83  for (size_t i = 0; i < list1.size(); i++)
84  {
85  vector<int> alias1 = list1.getAlphabet()->getAlias(list1[i]);
86  vector<int> alias2 = list2.getAlphabet()->getAlias(list2[i]);
87  double n1 = (double)alias1.size();
88  double n2 = (double)alias2.size();
89  for (size_t j = 0; j < alias1.size(); j++)
90  for (size_t k = 0; k < alias2.size(); k++)
91  counts[alias1[j]][alias2[k]] += 1./(n1*n2) ;
92  }
93  }
94 }
95 
96 void SymbolListTools::getFrequencies(const SymbolList& list, map<int, double>& frequencies, bool resolveUnknowns)
97 {
98  double n = (double)list.size();
99  map<int, double> counts;
100  getCounts(list, counts, resolveUnknowns);
101  for (map<int, double>::iterator i = counts.begin(); i != counts.end(); i++)
102  {
103  frequencies[i->first] = i->second / n;
104  }
105 }
106 
107 void SymbolListTools::getFrequencies(const SymbolList& list1, const SymbolList& list2, map<int, map<int, double> >& frequencies, bool resolveUnknowns) throw (DimensionException)
108 {
109  double n2 = (double)list1.size() * (double)list1.size();
110  map<int, map<int, double> > counts;
111  getCounts(list1, list2, counts, resolveUnknowns);
112  for (map<int, map<int, double> >::iterator i = counts.begin(); i != counts.end(); i++)
113  for (map<int, double>::iterator j = i->second.begin(); j != i->second.end(); j++)
114  {
115  frequencies[i->first][j->first] = j->second / n2;
116  }
117 }
118 
119 double SymbolListTools::getGCContent(const SymbolList& list, bool ignoreUnresolved, bool ignoreGap) throw (AlphabetException)
120 {
121  const Alphabet * alphabet = list.getAlphabet();
122  if (!AlphabetTools::isNucleicAlphabet(alphabet))
123  throw AlphabetException("SymbolListTools::getGCContent. Method only works on nucleotides.", alphabet);
124  double gc = 0;
125  double total = 0;
126  for (size_t i = 0; i < list.size(); i++) {
127  int state = list.getValue(i);
128  if (state > -1) { // not a gap
129  if (state == 1 || state == 2) { // G or C
130  gc++;
131  total++;
132  } else if (state == 0 || state == 3) { // A, T or U
133  total++;
134  } else { // Unresolved character
135  if (!ignoreUnresolved) {
136  total++;
137  switch(state) {
138  case(7): gc++; break;// G or C
139  case(4): gc+=0.5; break;// A or C
140  case(5): gc+=0.5; break;// A or G
141  case(6): gc+=0.5; break;// C or T
142  case(9): gc+=0.5; break;// G or T
143  case(10): gc+=2./3.; break;// A or C or G
144  case(11): gc+=1./3.; break;// A or C or T
145  case(12): gc+=1./3.; break;// A or G or T
146  case(13): gc+=2./3.; break;// C or G or T
147  case(14): gc+=0.5; break;// A or C or G or T
148  }
149  }
150  }
151  } else {
152  if (!ignoreGap) total++;
153  }
154  }
155  return total != 0 ? gc/total : 0;
156 }
157 
158 size_t SymbolListTools::getNumberOfDistinctPositions(const SymbolList& l1, const SymbolList& l2) throw (AlphabetMismatchException)
159 {
160  if (l1.getAlphabet()->getAlphabetType() != l2.getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("SymbolListTools::getNumberOfDistinctPositions.", l1.getAlphabet(), l2.getAlphabet());
161  size_t n = min(l1.size(), l2.size());
162  size_t count = 0;
163  for (size_t i = 0; i < n; i++) {
164  if (l1[i] != l2[i]) count++;
165  }
166  return count;
167 }
168 
169 size_t SymbolListTools::getNumberOfPositionsWithoutGap(const SymbolList& l1, const SymbolList& l2) throw (AlphabetMismatchException)
170 {
171  if (l1.getAlphabet() -> getAlphabetType() != l2.getAlphabet() -> getAlphabetType()) throw AlphabetMismatchException("SymbolListTools::getNumberOfDistinctPositions.", l1.getAlphabet(), l2.getAlphabet());
172  size_t n = min(l1.size(), l2.size());
173  size_t count = 0;
174  for (size_t i = 0; i < n; i++) {
175  if (l1[i] != -1 && l2[i] != -1) count++;
176  }
177  return count;
178 }
179 
180 void SymbolListTools::changeGapsToUnknownCharacters(SymbolList& l)
181 {
182  int unknownCode = l.getAlphabet()->getUnknownCharacterCode();
183  for (size_t i = 0; i < l.size(); i++)
184  {
185  if (l.getAlphabet()->isGap(l[i])) l[i] = unknownCode;
186  }
187 }
188 
189 void SymbolListTools::changeUnresolvedCharactersToGaps(SymbolList& l)
190 {
191  int gapCode = l.getAlphabet()->getGapCharacterCode();
192  for (size_t i = 0; i < l.size(); i++)
193  {
194  if (l.getAlphabet()->isUnresolved(l[i])) l[i] = gapCode;
195  }
196 }
197 
The SymbolList interface.
Definition: SymbolList.h:60
virtual bool isGap(int state) const =0
This alphabet is used to deal with NumericAlphabet.
virtual bool isUnresolved(int state) const =0
The Alphabet interface.
Definition: Alphabet.h:130
STL namespace.
virtual std::vector< int > getAlias(int state) const =0
Get all resolved states that match a generic state.
virtual int getGapCharacterCode() const =0
The alphabet exception base class.
virtual const Alphabet * getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
virtual const std::vector< int > & getContent() const =0
Get the whole content of the list as a vector of int.
Exception thrown when two alphabets do not match.
virtual int getUnknownCharacterCode() const =0