bpp-seq  2.2.0
ProteicAlphabet.cpp
Go to the documentation of this file.
1 //
2 // File: ProteicAlphabet.cpp
3 // Authors: Guillaume Deuchst
4 // Julien Dutheil
5 // Sylvain Gaillard
6 // Created on: Tue Jul 22 2003
7 //
8 
9 /*
10  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
11 
12  This software is a computer program whose purpose is to provide classes
13  for sequences analysis.
14 
15  This software is governed by the CeCILL license under French law and
16  abiding by the rules of distribution of free software. You can use,
17  modify and/ or redistribute the software under the terms of the CeCILL
18  license as circulated by CEA, CNRS and INRIA at the following URL
19  "http://www.cecill.info".
20 
21  As a counterpart to the access to the source code and rights to copy,
22  modify and redistribute granted by the license, users are provided only
23  with a limited warranty and the software's author, the holder of the
24  economic rights, and the successive licensors have only limited
25  liability.
26 
27  In this respect, the user's attention is drawn to the risks associated
28  with loading, using, modifying and/or developing or reproducing the
29  software by the user in light of its specific status of free software,
30  that may mean that it is complicated to manipulate, and that also
31  therefore means that it is reserved for developers and experienced
32  professionals having in-depth computer knowledge. Users are therefore
33  encouraged to load and test the software's suitability as regards their
34  requirements in conditions enabling the security of their systems and/or
35  data to be ensured and, more generally, to use and operate it in the
36  same conditions as regards security.
37 
38  The fact that you are presently reading this means that you have had
39  knowledge of the CeCILL license and that you accept its terms.
40  */
41 
42 #include "ProteicAlphabet.h"
43 #include "ProteicAlphabetState.h"
44 #include <Bpp/Text/TextTools.h>
45 #include <Bpp/Utils/MapTools.h>
46 
47 using namespace bpp;
48 using namespace std;
49 
50 // From STL:
51 #include <map>
52 
53 /******************************************************************************/
54 
56 {
57  // Alphabet content definition
58  registerState(new ProteicAlphabetState(-1, "-", "GAP", "Gap"));
59  registerState(new ProteicAlphabetState( 0, "A", "ALA", "Alanine"));
60  registerState(new ProteicAlphabetState( 1, "R", "ARG", "Arginine"));
61  registerState(new ProteicAlphabetState( 2, "N", "ASN", "Asparagine"));
62  registerState(new ProteicAlphabetState( 3, "D", "ASP", "Asparatic Acid"));
63  registerState(new ProteicAlphabetState( 4, "C", "CYS", "Cysteine"));
64  registerState(new ProteicAlphabetState( 5, "Q", "GLN", "Glutamine"));
65  registerState(new ProteicAlphabetState( 6, "E", "GLU", "Glutamic acid"));
66  registerState(new ProteicAlphabetState( 7, "G", "GLY", "Glycine"));
67  registerState(new ProteicAlphabetState( 8, "H", "HIS", "Histidine"));
68  registerState(new ProteicAlphabetState( 9, "I", "ILE", "Isoleucine"));
69  registerState(new ProteicAlphabetState(10, "L", "LEU", "Leucine"));
70  registerState(new ProteicAlphabetState(11, "K", "LYS", "Lysine"));
71  registerState(new ProteicAlphabetState(12, "M", "MET", "Methionine"));
72  registerState(new ProteicAlphabetState(13, "F", "PHE", "Phenylalanine"));
73  registerState(new ProteicAlphabetState(14, "P", "PRO", "Proline"));
74  registerState(new ProteicAlphabetState(15, "S", "SER", "Serine"));
75  registerState(new ProteicAlphabetState(16, "T", "THR", "Threonine"));
76  registerState(new ProteicAlphabetState(17, "W", "TRP", "Tryptophan"));
77  registerState(new ProteicAlphabetState(18, "Y", "TYR", "Tyrosine"));
78  registerState(new ProteicAlphabetState(19, "V", "VAL", "Valine"));
79  registerState(new ProteicAlphabetState(20, "B", "B", "N or D"));
80  registerState(new ProteicAlphabetState(21, "Z", "Z", "Q or E"));
81  registerState(new ProteicAlphabetState(22, "X", "X", "Unresolved amino acid"));
82  registerState(new ProteicAlphabetState(22, "O", "O", "Unresolved amino acid"));
83  registerState(new ProteicAlphabetState(22, "0", "0", "Unresolved amino acid"));
84  registerState(new ProteicAlphabetState(22, "?", "?", "Unresolved amino acid"));
85  registerState(new ProteicAlphabetState(-2, "*", "STOP", "Stop"));
86 }
87 
88 /******************************************************************************/
89 
90 string ProteicAlphabet::getAbbr(const string& aa) const throw (AlphabetException)
91 {
92  string AA = TextTools::toUpper(aa);
93  return getState(aa).getAbbreviation();
94 }
95 
96 /******************************************************************************/
97 
98 string ProteicAlphabet::getAbbr(int aa) const throw (AlphabetException)
99 {
100  return getState(aa).getAbbreviation();
101 }
102 
103 /******************************************************************************/
104 
105 vector<int> ProteicAlphabet::getAlias(int state) const throw (BadIntException)
106 {
107  if (!isIntInAlphabet(state))
108  throw BadIntException(state, "ProteicAlphabet::getAlias(int): Specified base unknown.");
109  vector<int> v;
110  if (state == 20) // N or D
111  {
112  v.resize(2); v[0] = 2; v[1] = 3;
113  }
114  else if (state == 21) // Q or E
115  {
116  v.resize(2); v[0] = 5; v[1] = 6;
117  }
118  else if (state == 22) // all!
119  {
120  v.resize(20);
121  for (size_t i = 0; i < 20; i++)
122  {
123  v[i] = static_cast<int>(i);
124  }
125  }
126  else
127  {
128  v.resize(1); v[0] = state;
129  }
130  return v;
131 }
132 
133 /******************************************************************************/
134 
135 vector<string> ProteicAlphabet::getAlias(const string& state) const throw (BadCharException)
136 {
137  string locstate = TextTools::toUpper(state);
138  if (!isCharInAlphabet(locstate))
139  throw BadCharException(locstate, "ProteicAlphabet::getAlias(int): Specified base unknown.");
140  vector<string> v;
141  if (locstate == "B") // N or D
142  {
143  v.resize(2); v[0] = "N"; v[1] = "D";
144  }
145  else if (locstate == "Z") // Q or E
146  {
147  v.resize(2); v[0] = "Q"; v[1] = "E";
148  }
149  else if (locstate == "X"
150  || locstate == "O"
151  || locstate == "0"
152  || locstate == "?") // all!
153  {
154  v.resize(20);
155  for (int i = 0; i < 20; i++)
156  {
157  v[static_cast<size_t>(i)] = getState(i).getLetter();
158  }
159  }
160  else
161  {
162  v.resize(1); v[0] = locstate;
163  }
164  return v;
165 }
166 
167 /******************************************************************************/
168 
169 int ProteicAlphabet::getGeneric(const vector<int>& states) const throw (BadIntException)
170 {
171  map<int, int> m;
172  for (unsigned int i = 0; i < states.size(); ++i)
173  {
174  vector<int> tmp_s = this->getAlias(states[i]); // get the states for generic characters
175  for (unsigned int j = 0; j < tmp_s.size(); ++j)
176  {
177  m[tmp_s[j]]++; // add each state to the list
178  }
179  }
180  vector<int> ve = MapTools::getKeys(m);
181 
182  string key;
183  for (unsigned int i = 0; i < ve.size(); ++i)
184  {
185  if (!isIntInAlphabet(ve[i]))
186  throw BadIntException(ve[i], "ProteicAlphabet::getGeneric(const vector<int>): Specified base unknown.");
187  key += "_" + TextTools::toString(ve[i]);
188  }
189  map<string, int> g;
190  g["_2_3"] = 20;
191  g["_5_6"] = 21;
192  int v;
193  map<string, int>::iterator it = g.find(key);
194  if (ve.size() == 1)
195  {
196  v = ve[0];
197  }
198  else if (it != g.end())
199  {
200  v = it->second;
201  }
202  else
203  {
204  v = 22;
205  }
206  return v;
207 }
208 
209 /******************************************************************************/
210 
211 string ProteicAlphabet::getGeneric(const vector<string>& states) const throw (BadCharException)
212 {
213  map<string, int> m;
214  for (unsigned int i = 0; i < states.size(); ++i)
215  {
216  vector<string> tmp_s = this->getAlias(states[i]); // get the states for generic characters
217  for (unsigned int j = 0; j < tmp_s.size(); ++j)
218  {
219  m[tmp_s[j]]++; // add each state to the list
220  }
221  }
222  vector<string> ve = MapTools::getKeys(m);
223 
224  string key;
225  for (unsigned int i = 0; i < ve.size(); ++i)
226  {
227  if (!isCharInAlphabet(ve[i]))
228  throw BadCharException(ve[i], "ProteicAlphabet::getAlias(const vector<string>): Specified base unknown.");
229  key += TextTools::toString(ve[i]);
230  }
231  map<string, string> g;
232  g["DN"] = "B";
233  g["EQ"] = "Z";
234  string v;
235  map<string, string>::iterator it = g.find(key);
236  if (ve.size() == 1)
237  {
238  v = ve[0];
239  }
240  else if (it != g.end())
241  {
242  v = it->second;
243  }
244  else
245  {
246  v = "?";
247  }
248  return v;
249 }
250 
251 /******************************************************************************/
An alphabet exception thrown when trying to specify a bad char to the alphabet.
std::vector< int > getAlias(int state) const
Get all resolved states that match a generic state.
This alphabet is used to deal with NumericAlphabet.
STL namespace.
int getGeneric(const std::vector< int > &states) const
Get the generic state that matches a set of states.
std::string getAbbr(const std::string &aa) const
Get the abbreviation (3 letter code) for a state coded as char.
This is the base class to describe states in a ProteicAlphabet.
The alphabet exception base class.
An alphabet exception thrown when trying to specify a bad int to the alphabet.