bpp-seq  2.2.0
StringSequenceTools.cpp
Go to the documentation of this file.
1 //
2 // File: StringSequenceTools.cpp
3 // Created by: Julien Dutheil
4 // Created on: Sun Nov 30 11:29:07 2003
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for sequences analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "StringSequenceTools.h"
41 
42 #include "Alphabet/AlphabetTools.h"
43 #include "Alphabet/DNA.h"
44 #include "Alphabet/RNA.h"
46 #include <Bpp/Text/TextTools.h>
47 #include <Bpp/Numeric/Random/RandomTools.h>
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <map>
53 #include <ctype.h>
54 #include <algorithm>
55 #include <iostream>
56 
57 using namespace std;
58 
59 /****************************************************************************************/
60 
61 string StringSequenceTools::subseq(const string& sequence, size_t begin, size_t end) throw (Exception)
62 {
63  // Checking interval
64  if (end < begin)
65  throw Exception ("StringSequenceTools::subseq: Invalid interval");
66 
67  // Copy sequence
68  string temp(sequence);
69 
70  // Truncate sequence
71  temp.erase(temp.begin() + static_cast<ptrdiff_t>(end + 1), temp.end());
72  temp.erase(temp.begin(), temp.begin() + static_cast<ptrdiff_t>(begin));
73 
74  // Send result
75  return temp;
76 }
77 
78 /****************************************************************************************/
79 
80 string StringSequenceTools::setToSizeR(const string& sequence, size_t size)
81 {
82  return TextTools::resizeRight(sequence, size, '-');
83 }
84 
85 string StringSequenceTools::setToSizeL(const string& sequence, size_t size)
86 {
87  return TextTools::resizeLeft(sequence, size, '-');
88 }
89 
90 /****************************************************************************************/
91 
92 string StringSequenceTools::deleteChar(const string& sequence, char chars)
93 {
94  // Copy sequence
95  string result(sequence);
96 
97  // Search and delete specified char
98  for (unsigned int i = 0; i < result.size(); i++)
99  {
100  if (result[i] == chars)
101  result.erase(result.begin() + i);
102  }
103 
104  return result;
105 }
106 
107 /****************************************************************************************/
108 
109 string StringSequenceTools::deleteChar(const string& sequence, string chars)
110 {
111  // Copy sequence
112  string result(sequence);
113 
114  // For all characters to delete
115  for (unsigned int i = 0; i < chars.size(); i++)
116  {
117  // Search and delete char
118  for (unsigned int j = 0; j < result.size(); j++)
119  {
120  if (result[j] == chars[i])
121  result.erase(result.begin() + j);
122  }
123  }
124 
125  return result;
126 }
127 
128 /****************************************************************************************/
129 
130 string* StringSequenceTools::reverse(const string& sequence)
131 {
132  // Initializing
133  string* result = new string;
134 
135  // Main loop : reverse all characters of sequence
136  size_t size = sequence.size();
137  for (size_t i = 0; i < size; i++)
138  {
139  *result += sequence[size - i - 1];
140  }
141 
142  // Send result
143  return result;
144 }
145 
146 /****************************************************************************************/
147 
148 string* StringSequenceTools::complement(const string& sequence)
149 {
150  // Initializing
151  string* result = new string;
152 
153  // Main loop : completement all characters
154  size_t size = sequence.size();
155  for (unsigned int i = 0; i < size; i++)
156  {
157  switch (sequence[i])
158  {
159  case 'A': *result += 'T';
160  break;
161  case 'C': *result += 'G';
162  break;
163  case 'G': *result += 'C';
164  break;
165  case 'T': *result += 'A';
166  break;
167  case 'M': *result += 'K';
168  break;
169  case 'R': *result += 'Y';
170  break;
171  case 'Y': *result += 'R';
172  break;
173  case 'K': *result += 'M';
174  break;
175  case 'V': *result += 'B';
176  break;
177  case 'H': *result += 'D';
178  break;
179  case 'D': *result += 'H';
180  break;
181  case 'B': *result += 'V';
182  break;
183  default: *result += sequence[i];
184  break;
185  }
186  }
187 
188  // Send new sequence
189  return result;
190 }
191 
192 /****************************************************************************************/
193 
194 double StringSequenceTools::getGCcontent(const string& sequence, size_t pos, size_t window) throw (BadIntegerException, Exception)
195 {
196  // Frequency counts for nucleotids A, C, G, T
197  map<char, double> counts;
198 
199  // Window size checking
200  if (window < sequence.size())
201  throw BadIntegerException("StringSequenceTools::getGCContent : specified window too high", static_cast<int>(window));
202 
203  // For last nucleotides
204  if (pos + window > sequence.size())
205  {
206  pos = sequence.size() - window;
207  }
208 
209  // Main loop
210  for (size_t i = pos; i < pos + window; i++)
211  {
212  switch (toupper(sequence[i]))
213  {
214  case 'A': counts['A'] += 1;
215  break;
216  case 'C': counts['C'] += 1;
217  break;
218  case 'G': counts['G'] += 1;
219  break;
220  case 'T': counts['T'] += 1;
221  break;
222  case 'M': counts['A'] += 0.5;
223  counts['C'] += 0.5;
224  break;
225  case 'R': counts['A'] += 0.5;
226  counts['G'] += 0.5;
227  break;
228  case 'W': counts['A'] += 0.5;
229  counts['T'] += 0.5;
230  break;
231  case 'S': counts['C'] += 0.5;
232  counts['G'] += 0.5;
233  break;
234  case 'Y': counts['C'] += 0.5;
235  counts['T'] += 0.5;
236  break;
237  case 'K': counts['G'] += 0.5;
238  counts['T'] += 0.5;
239  break;
240  case 'V': counts['A'] += 0.34;
241  counts['C'] += 0.34;
242  counts['G'] += 0.34;
243  break;
244  case 'H': counts['A'] += 0.34;
245  counts['C'] += 0.34;
246  counts['T'] += 0.34;
247  break;
248  case 'D': counts['A'] += 0.34;
249  counts['G'] += 0.34;
250  counts['T'] += 0.34;
251  break;
252  case 'B': counts['C'] += 0.34;
253  counts['G'] += 0.34;
254  counts['T'] += 0.34;
255  break;
256  case '-': throw Exception("StringSequenceTools::getGCContent : Gap found in sequence");
257  break;
258  // Unresolved bases
259  default: counts['A'] += 0.25;
260  counts['C'] += 0.25;
261  counts['G'] += 0.25;
262  counts['T'] += 0.25;
263  }
264  }
265 
266  // Calculate and send GC rate
267  return (counts['G'] + counts['C']) / static_cast<double>(window);
268 }
269 
270 /****************************************************************************************/
271 
272 vector<int> StringSequenceTools::codeSequence(const string& sequence, const Alphabet* alphabet)
273 throw (BadCharException)
274 {
275  unsigned int size = AlphabetTools::getAlphabetCodingSize(alphabet); // Warning, an exception may be casted here!
276  vector<int> code(static_cast<size_t>(floor(static_cast<double>(sequence.size()) / static_cast<double>(size))));
277  size_t pos = 0;
278  size_t count = 0;
279  while (pos + size <= sequence.size())
280  {
281  code[count] = alphabet->charToInt(sequence.substr(pos, size));
282  count++;
283  pos += size;
284  }
285  return code;
286 }
287 
288 /****************************************************************************************/
289 
290 string StringSequenceTools::decodeSequence(const vector<int>& sequence, const Alphabet* alphabet) throw (BadIntException)
291 {
292  string result = "";
293  for (unsigned int i = 0; i < sequence.size(); i++)
294  {
295  result += alphabet->intToChar(sequence[i]);
296  }
297  return result;
298 }
299 
300 /****************************************************************************************/
301 
304 {
305  // empty sequence test
306  if (sequence.size() == 0)
307  {
308  throw EmptySequenceException("Sequence::getAlphabetFromSequence : Empty sequence string");
309  }
310 
311  // initialisation
312  bool p = false; // indicates that a protein specific character is found
313  bool r = false; // indicates that a RNA specific character is found
314  bool u = false; // indicates that an unknown character is found
315  bool pd = false; // Protein or DNA (T)
316 
317  // Main loop : for all character in sequence
318  for (unsigned int i = 0; i < sequence.size(); i++)
319  {
320  // Character analyse
321  switch (AlphabetTools::getType(sequence[i]))
322  {
323  case 0: u = true; break;
324  case 3: p = true; break;
325  case 2: r = true; break;
326  case 5: pd = true; break;
327  }
328  }
329 
330  if (u)
331  throw AlphabetException ("Sequence::getAlphabetFromSequence : Unknow character detected in specified sequence");
332  if (r && pd)
333  throw SequenceException ("Sequence::getAlphabetFromSequence : Both 'T' and 'U' in the same sequence!");
334  if (r && p)
335  throw SequenceException ("Sequence::getAlphabetFromSequence : Protein character and 'U' in the same sequence!");
336  if (p)
337  return new ProteicAlphabet();
338  if (r)
339  return new RNA();
340  return new DNA();
341 }
342 
343 /****************************************************************************************/
344 
static unsigned int getAlphabetCodingSize(const Alphabet &alphabet)
In case that all states in the given alphabet have a string description of same length, send this length; e.g. 3 for codon alphabets.
An alphabet exception thrown when trying to specify a bad char to the alphabet.
static std::string * reverse(const std::string &sequence)
Reverse the sequence.
This alphabet is used to deal NumericAlphabet.
The Alphabet interface.
Definition: Alphabet.h:130
static Alphabet * getAlphabetFromSequence(const std::string &sequence)
Parse a sequence and try to guess the correct alphabet to use.
STL namespace.
static std::string decodeSequence(const std::vector< int > &sequence, const Alphabet *alphabet)
Convert a sequence to its string representation.
static std::string setToSizeR(const std::string &sequence, size_t size)
Set up the size of a sequence from the right side.
static std::vector< int > codeSequence(const std::string &sequence, const Alphabet *alphabet)
Convert a string sequence to a vector of int.
This alphabet is used to deal with proteins.
static double getGCcontent(const std::string &sequence, size_t pos, size_t window)
Calculate the local GC content of a sequence.
static std::string setToSizeL(const std::string &sequence, size_t size)
Set up the size of a sequence from the left side.
static std::string deleteChar(const std::string &sequence, char chars)
Delete all occurence of a character in the sequence.
The alphabet exception base class.
static int getType(char state)
Character identification method for sequence&#39;s alphabet identification.
The sequence exception base class.
This alphabet is used to deal with DNA sequences.
Definition: DNA.h:60
static std::string subseq(const std::string &sequence, size_t begin, size_t end)
Get a subsequence.
An alphabet exception thrown when trying to specify a bad int to the alphabet.
This alphabet is used to deal with RNA sequences.
Definition: RNA.h:58
static std::string * complement(const std::string &sequence)
Get the complement of a sequence.
Exception thrown when a sequence is found to be empty and it should not.