bpp-seq  2.2.0
WordAlphabet.cpp
Go to the documentation of this file.
1 //
2 // File: WordAlphabet.cpp
3 // Authors: Laurent Gueguen
4 // Sylvain Gaillard
5 // Created on: Sun Dec 28 2008
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "WordAlphabet.h"
42 #include <Bpp/Text/TextTools.h>
43 
44 using namespace bpp;
45 
46 // From the STL:
47 #include <iostream>
48 
49 using namespace std;
50 
51 WordAlphabet::WordAlphabet(const vector<const Alphabet*>& vAlpha) :
53  vAbsAlph_(vAlpha)
54 {
55  build_();
56 }
57 
58 WordAlphabet::WordAlphabet(const Alphabet* pAlpha, unsigned int num) :
60  vAbsAlph_(0)
61 {
62  for (unsigned int i = 0; i < num; i++)
63  {
64  vAbsAlph_.push_back(pAlpha);
65  }
66 
67  build_();
68 }
69 
71 {
72  size_t size = 1;
73 
74  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
75  {
76  size *= vAbsAlph_[i]->getSize();
77  }
78 
79  vector<AlphabetState*> states(size + 2);
80 
81  string s = "";
82  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
83  {
84  s += "-";
85  }
86 
87  states[0] = new AlphabetState(-1, s, "gap");
88 
89  for (size_t i = 0; i < size; ++i)
90  {
91  states[i + 1] = new AlphabetState(static_cast<int>(i), "", "");
92  }
93 
94  size_t lr = size;
95  char c;
96  for (size_t na = 0; na < vAbsAlph_.size(); ++na)
97  {
98  lr /= vAbsAlph_[na]->getSize();
99  size_t j = 1;
100  int i = 0;
101  while (j <= size)
102  {
103  c = vAbsAlph_[na]->intToChar(i)[0];
104 
105  for (size_t k = 0; k < lr; k++)
106  {
107  states[j]->setLetter(states[j]->getLetter() + c);
108  j++;
109  // alphabet[j++].letter += c;
110  }
111 
112  if (++i == static_cast<int>(vAbsAlph_[na]->getSize()))
113  i = 0;
114  }
115  }
116 
117  s = "";
118  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
119  {
120  s += "N";
121  }
122 
123  states[size + 1] = new AlphabetState(static_cast<int>(size), s, "Unresolved");
124 
125  //Now register all states once for all:
126  for (size_t i = 0; i < states.size(); ++i) {
127  registerState(states[i]);
128  }
129  //jdutheil on 24/07/14: this should not be necessary anymore.
130  //remap();
131 }
132 
133 /******************************************************************************/
134 
135 std::string WordAlphabet::getAlphabetType() const
136 {
137  string s = "Word alphabet:";
138  for (unsigned int i = 0; i < vAbsAlph_.size(); i++)
139  {
140  s += " " + vAbsAlph_[i]->getAlphabetType();
141  }
142 
143  return s;
144 }
145 
147 {
148  string s = vAbsAlph_[0]->getAlphabetType();
149  for (unsigned int i = 1; i < vAbsAlph_.size(); i++)
150  {
151  if (vAbsAlph_[i]->getAlphabetType() != s)
152  return false;
153  }
154  return true;
155 }
156 
157 bool WordAlphabet::containsUnresolved(const std::string& state) const throw (BadCharException)
158 {
159  size_t s = vAbsAlph_.size();
160  if (state.length() != s)
161  throw BadCharException(state, "WordAlphabet::containsUnresolved", this);
162 
163  for (size_t i = 0; i < vAbsAlph_.size(); i++)
164  {
165  if (vAbsAlph_[i]->isUnresolved(state.substr(i, 1)))
166  {
167  return true;
168  }
169  }
170  return false;
171 }
172 
173 /******************************************************************************/
174 
175 bool WordAlphabet::containsGap(const std::string& state) const throw (BadCharException)
176 {
177  size_t s = vAbsAlph_.size();
178  if (state.length() != s)
179  throw BadCharException(state, "WordAlphabet::containsGap", this);
180 
181  for (size_t i = 0; i < vAbsAlph_.size(); i++)
182  {
183  if (vAbsAlph_[i]->isGap(state.substr(i, 1)))
184  return true;
185  }
186 
187  return false;
188 }
189 
190 /******************************************************************************/
191 
192 std::string WordAlphabet::getName(const std::string& state) const throw (BadCharException)
193 {
194  if (state.size() != vAbsAlph_.size())
195  throw BadCharException(state, "WordAlphabet::getName", this);
196  if (containsUnresolved(state))
197  return getStateAt(getSize() + 1).getName();
198  if (containsGap(state))
199  return getStateAt(0).getName();
200  else
201  return AbstractAlphabet::getName(state);
202 }
203 
204 /******************************************************************************/
205 
206 std::vector<int> WordAlphabet::getAlias(int state) const throw (BadIntException)
207 {
208  if (!isIntInAlphabet(state))
209  throw BadIntException(state, "WordAlphabet::getAlias(int): Specified base unknown.");
210  vector<int> v;
211  size_t s = getSize();
212 
213  if (static_cast<size_t>(state) == s)
214  {
215  v.resize(s);
216  for (size_t i = 0; i < s; ++i)
217  {
218  v[i] = static_cast<int>(i);
219  }
220  }
221  else
222  {
223  v.resize(1); v[0] = state;
224  }
225  return v;
226 }
227 
228 /******************************************************************************/
229 
230 std::vector<std::string> WordAlphabet::getAlias(const std::string& state) const throw (BadCharException)
231 {
232  string locstate = TextTools::toUpper(state);
233  if (!isCharInAlphabet(locstate))
234  throw BadCharException(locstate, "WordAlphabet::getAlias(string): Specified base unknown.");
235  vector<string> v;
236 
237  size_t s = getSize();
238 
239  string st = "";
240  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
241  {
242  st += "N";
243  }
244 
245  if (locstate == st)
246  {
247  v.resize(s);
248  for (size_t i = 0; i < s; ++i)
249  {
250  v[i] = intToChar(static_cast<int>(i));
251  }
252  }
253  else
254  {
255  v.resize(1); v[0] = state;
256  }
257  return v;
258 }
259 
260 /******************************************************************************/
261 
262 int WordAlphabet::getGeneric(const std::vector<int>& states) const throw (BadIntException)
263 {
264  return states[0];
265 }
266 
267 /******************************************************************************/
268 
269 std::string WordAlphabet::getGeneric(const std::vector<std::string>& states) const throw (BadCharException)
270 {
271  return states[0];
272 }
273 
274 /******************************************************************************/
275 
276 int WordAlphabet::getWord(const std::vector<int>& vint, size_t pos) const throw (IndexOutOfBoundsException)
277 {
278  if (vint.size() < pos + vAbsAlph_.size())
279  throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vint.size() - vAbsAlph_.size());
280 
281  vector<string> vs;
282  for (size_t i = 0; i < vAbsAlph_.size(); i++)
283  {
284  vs.push_back(vAbsAlph_[i]->intToChar(vint[i + pos]));
285  }
286 
287  return charToInt(getWord(vs)); // This can't throw a BadCharException!
288 }
289 
290 /****************************************************************************************/
291 
292 std::string WordAlphabet::getWord(const std::vector<string>& vpos, size_t pos) const throw (IndexOutOfBoundsException, BadCharException)
293 {
294  if (vpos.size() < pos + vAbsAlph_.size())
295  throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vpos.size() - vAbsAlph_.size());
296 
297  string s = "";
298  for (size_t i = 0; i < vAbsAlph_.size(); i++)
299  {
300  s += vpos[pos + i];
301  }
302  // test
303  charToInt(s);
304  return s;
305 }
306 
307 /****************************************************************************************/
308 
309 Sequence* WordAlphabet::translate(const Sequence& sequence, size_t pos) const throw (AlphabetMismatchException, Exception)
310 {
311  if ((!hasUniqueAlphabet()) or
312  (sequence.getAlphabet()->getAlphabetType() != vAbsAlph_[0]->getAlphabetType()))
313  throw AlphabetMismatchException("No matching alphabets", sequence.getAlphabet(), vAbsAlph_[0]);
314 
315  vector<int> v1 = sequence.getContent();
316  vector<int> v2;
317 
318  size_t s = sequence.size();
319  unsigned int l = getLength();
320  size_t i = pos;
321 
322  while (i + l <= s)
323  {
324  v2.push_back(getWord(v1, i));
325  i += l;
326  }
327 
328  return new BasicSequence(sequence.getName(), v2, this);
329 }
330 
331 /****************************************************************************************/
332 
333 Sequence* WordAlphabet::reverse(const Sequence& sequence) const throw (AlphabetMismatchException, Exception)
334 {
335  if ((!hasUniqueAlphabet()) or
336  (sequence.getAlphabet()->getAlphabetType() != getAlphabetType()))
337  throw AlphabetMismatchException("No matching alphabets");
338 
339  Sequence* pseq = new BasicSequence(sequence.getName(), "", getNAlphabet(0));
340 
341  size_t s = sequence.size();
342  for (size_t i = 0; i < s; i++)
343  {
344  pseq->append(getPositions(sequence[i]));
345  }
346 
347  return pseq;
348 }
349 
350 /****************************************************************************************/
351 
This is the base class to describe states in an Alphabet.
Definition: AlphabetState.h:54
An alphabet exception thrown when trying to specify a bad char to the alphabet.
Sequence * reverse(const Sequence &sequence) const
Translate a whole sequence from words alphabet to letters alphabet.
int getGeneric(const std::vector< int > &states) const
Get the generic state that matches a set of states.
bool containsUnresolved(const std::string &state) const
This alphabet is used to deal with NumericAlphabet.
Sequence * translate(const Sequence &sequence, size_t=0) const
Translate a whole sequence from letters alphabet to words alphabet.
The Alphabet interface.
Definition: Alphabet.h:130
STL namespace.
std::string getAlphabetType() const
Identification method.
unsigned int getSize() const
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet). This is the method you'll need in most cases.
Definition: WordAlphabet.h:138
WordAlphabet(const std::vector< const Alphabet *> &vAlpha)
Builds a new word alphabet from a vector of Alphabets.
virtual void append(const std::vector< int > &content)=0
Append the specified content to the sequence.
bool containsGap(const std::string &state) const
std::vector< const Alphabet *> vAbsAlph_
Definition: WordAlphabet.h:70
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
A partial implementation of the Alphabet interface.
virtual size_t size() const
Get the number of elements in the list.
Definition: SymbolList.h:350
std::string getName(const std::string &state) const
Get the complete name of a state given its string description.
The sequence interface.
Definition: Sequence.h:74
virtual int getWord(const std::vector< int > &vint, size_t pos=0) const
Get the int code for a word given the int code of the underlying positions.
An alphabet exception thrown when trying to specify a bad int to the alphabet.
Exception thrown when two alphabets do not match.
virtual void registerState(AlphabetState *st)
Add a state to the Alphabet.
bool hasUniqueAlphabet() const
Returns true if the alphabets of all the letters in the word are of the same type.
std::string getName(const std::string &state) const
Get the complete name of a state given its string description.
std::vector< int > getAlias(int state) const
Get all resolved states that match a generic state.