bpp-seq  2.2.0
SequenceTools.cpp
Go to the documentation of this file.
1 //
2 // File: SequenceTools.cpp
3 // Authors: Guillaume Deuchst
4 // Julien Dutheil
5 // Sylvain Gaillard
6 // Created on: Tue Aug 21 2003
7 //
8 
9 /*
10  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
11 
12  This software is a computer program whose purpose is to provide classes
13  for sequences analysis.
14 
15  This software is governed by the CeCILL license under French law and
16  abiding by the rules of distribution of free software. You can use,
17  modify and/ or redistribute the software under the terms of the CeCILL
18  license as circulated by CEA, CNRS and INRIA at the following URL
19  "http://www.cecill.info".
20 
21  As a counterpart to the access to the source code and rights to copy,
22  modify and redistribute granted by the license, users are provided only
23  with a limited warranty and the software's author, the holder of the
24  economic rights, and the successive licensors have only limited
25  liability.
26 
27  In this respect, the user's attention is drawn to the risks associated
28  with loading, using, modifying and/or developing or reproducing the
29  software by the user in light of its specific status of free software,
30  that may mean that it is complicated to manipulate, and that also
31  therefore means that it is reserved for developers and experienced
32  professionals having in-depth computer knowledge. Users are therefore
33  encouraged to load and test the software's suitability as regards their
34  requirements in conditions enabling the security of their systems and/or
35  data to be ensured and, more generally, to use and operate it in the
36  same conditions as regards security.
37 
38  The fact that you are presently reading this means that you have had
39  knowledge of the CeCILL license and that you accept its terms.
40  */
41 
42 #include "SequenceTools.h"
43 
44 #include "Alphabet/AlphabetTools.h"
45 #include "StringSequenceTools.h"
46 #include <Bpp/Numeric/Matrix/Matrix.h>
47 #include <Bpp/Numeric/VectorTools.h>
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <ctype.h>
53 #include <cmath>
54 #include <list>
55 #include <iostream>
56 
57 using namespace std;
58 
65 
66 /******************************************************************************/
67 
68 Sequence* SequenceTools::subseq(const Sequence& sequence, size_t begin, size_t end) throw (IndexOutOfBoundsException, Exception)
69 {
70  // Checking interval
71  if (end >= sequence.size())
72  throw IndexOutOfBoundsException ("SequenceTools::subseq : Invalid upper bound", end, 0, sequence.size());
73  if (end < begin)
74  throw Exception ("SequenceTools::subseq : Invalid interval");
75 
76  // Copy sequence
77  vector<int> temp(sequence.getContent());
78 
79  // Truncate sequence
80  temp.erase(temp.begin() + static_cast<ptrdiff_t>(end + 1), temp.end());
81  temp.erase(temp.begin(), temp.begin() + static_cast<ptrdiff_t>(begin));
82 
83  // New sequence creation
84  return new BasicSequence(sequence.getName(), temp, sequence.getComments(), sequence.getAlphabet());
85 }
86 
87 /******************************************************************************/
88 
89 Sequence* SequenceTools::concatenate(const Sequence& seq1, const Sequence& seq2) throw (AlphabetMismatchException, Exception)
90 {
91  // Sequence's alphabets matching verification
92  if ((seq1.getAlphabet()->getAlphabetType()) != (seq2.getAlphabet()->getAlphabetType()))
93  throw AlphabetMismatchException("SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet());
94 
95  // Sequence's names matching verification
96  if (seq1.getName() != seq2.getName())
97  throw Exception ("SequenceTools::concatenate : Sequence's names don't match");
98 
99  // Concatenate sequences and send result
100  vector<int> sequence = seq1.getContent();
101  vector<int> sequence2 = seq2.getContent();
102  sequence.insert(sequence.end(), sequence2.begin(), sequence2.end());
103  return new BasicSequence(seq1.getName(), sequence, seq1.getComments(), seq1.getAlphabet());
104 }
105 
106 /******************************************************************************/
107 
109 {
110  // Alphabet type checking
112  if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
113  {
114  NAR = &_DNARep;
115  }
116  else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
117  {
118  NAR = &_RNARep;
119  }
120  else
121  {
122  throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
123  }
124  for (size_t i = 0; i < seq.size(); i++)
125  {
126  seq.setElement(i, NAR->translate(seq.getValue(i)));
127  }
128  return seq;
129 }
130 
131 /******************************************************************************/
132 
134 {
135  // Alphabet type checking
137  if (sequence.getAlphabet()->getAlphabetType() == "DNA alphabet")
138  {
139  NAR = &_DNARep;
140  }
141  else if (sequence.getAlphabet()->getAlphabetType() == "RNA alphabet")
142  {
143  NAR = &_RNARep;
144  }
145  else
146  {
147  throw AlphabetException ("SequenceTools::getComplement: Sequence must be nucleic.", sequence.getAlphabet());
148  }
149 
150  return NAR->translate(sequence);
151 }
152 
153 /******************************************************************************/
154 
156 {
157  // Alphabet type checking
158  if (sequence.getAlphabet()->getAlphabetType() != "DNA alphabet")
159  {
160  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", sequence.getAlphabet());
161  }
162 
163  return _transc.translate(sequence);
164 }
165 
166 /******************************************************************************/
167 
169 {
170  // Alphabet type checking
171  if (sequence.getAlphabet()->getAlphabetType() != "RNA alphabet")
172  {
173  throw AlphabetException ("SequenceTools::reverseTranscript : Sequence must be RNA", sequence.getAlphabet());
174  }
175 
176  return _transc.reverse(sequence);
177 }
178 
179 /******************************************************************************/
180 
182 {
183  size_t seq_size = seq.size(); // store seq size for efficiency
184  int tmp_state = 0; // to store one state when swapping positions
185  size_t j = seq_size; // symetric position iterator from sequence end
186  for (size_t i = 0; i < seq_size / 2; i++)
187  {
188  j = seq_size - 1 - i;
189  tmp_state = seq.getValue(i);
190  seq.setElement(i, seq.getValue(j));
191  seq.setElement(j, tmp_state);
192  }
193  return seq;
194 }
195 
196 /******************************************************************************/
197 
199 {
200  Sequence* iSeq = sequence.clone();
201  invert(*iSeq);
202  return iSeq;
203 }
204 
205 /******************************************************************************/
206 
208 {
209  // Alphabet type checking
211  if (seq.getAlphabet()->getAlphabetType() == "DNA alphabet")
212  {
213  NAR = &_DNARep;
214  }
215  else if (seq.getAlphabet()->getAlphabetType() == "RNA alphabet")
216  {
217  NAR = &_RNARep;
218  }
219  else
220  {
221  throw AlphabetException("SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
222  }
223  // for (size_t i = 0 ; i < seq.size() ; i++) {
224  // seq.setElement(i, NAR->translate(seq.getValue(i)));
225  // }
226  size_t seq_size = seq.size(); // store seq size for efficiency
227  int tmp_state = 0; // to store one state when swapping positions
228  size_t j = seq_size; // symetric position iterator from sequence end
229  for (size_t i = 0; i < seq_size / 2; i++)
230  {
231  j = seq_size - 1 - i;
232  tmp_state = seq.getValue(i);
233  seq.setElement(i, NAR->translate(seq.getValue(j)));
234  seq.setElement(j, NAR->translate(tmp_state));
235  }
236  if (seq_size % 2) // treate the state in the middle of odd sequences
237  {
238  seq.setElement(seq_size / 2, NAR->translate(seq.getValue(seq_size / 2)));
239  }
240  return seq;
241 }
242 
243 /******************************************************************************/
244 
246 {
247  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
248  throw AlphabetMismatchException("SequenceTools::getPercentIdentity", seq1.getAlphabet(), seq2.getAlphabet());
249  if (seq1.size() != seq2.size())
250  throw SequenceNotAlignedException("SequenceTools::getPercentIdentity", &seq2);
251  int gap = seq1.getAlphabet()->getGapCharacterCode();
252  size_t id = 0;
253  size_t tot = 0;
254  for (size_t i = 0; i < seq1.size(); i++)
255  {
256  int x = seq1.getValue(i);
257  int y = seq2.getValue(i);
258  if (ignoreGaps)
259  {
260  if (x != gap && y != gap)
261  {
262  tot++;
263  if (x == y)
264  id++;
265  }
266  }
267  else
268  {
269  tot++;
270  if (x == y)
271  id++;
272  }
273  }
274  return static_cast<double>(id) / static_cast<double>(tot) * 100.;
275 }
276 
277 /******************************************************************************/
278 
280 {
281  size_t count = 0;
282  const Alphabet* alpha = seq.getAlphabet();
283  for (size_t i = 0; i < seq.size(); i++)
284  {
285  if (!alpha->isGap(seq[i]))
286  count++;
287  }
288  return count;
289 }
290 
291 /******************************************************************************/
292 
294 {
295  size_t count = 0;
296  const Alphabet* alpha = seq.getAlphabet();
297  for (size_t i = 0; i < seq.size(); i++)
298  {
299  if (!alpha->isGap(seq[i]) && !alpha->isUnresolved(seq[i]))
300  count++;
301  }
302  return count;
303 }
304 
305 /******************************************************************************/
306 
308 {
309  const Alphabet* alpha = seq.getAlphabet();
310  vector<int> content;
311  for (size_t i = 0; i < seq.size(); i++)
312  {
313  if (!(alpha->isGap(seq[i]) || alpha->isUnresolved(seq[i])))
314  content.push_back(seq[i]);
315  }
316  Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
317  newSeq->setContent(content);
318  return newSeq;
319 }
320 
321 /******************************************************************************/
322 
324 {
325  size_t count = 0;
326  const Alphabet* alpha = seq.getAlphabet();
327  for (size_t i = 0; i < seq.size(); i++)
328  {
329  if (alpha->isUnresolved(seq[i]))
330  count++;
331  }
332  return count;
333 }
334 
335 /******************************************************************************/
336 
338 {
339  const Alphabet* alpha = seq.getAlphabet();
340  vector<int> content;
341  for (size_t i = 0; i < seq.size(); i++)
342  {
343  if (!alpha->isGap(seq[i]))
344  content.push_back(seq[i]);
345  }
346  Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
347  newSeq->setContent(content);
348  return newSeq;
349 }
350 
351 /******************************************************************************/
352 
354 {
355  const Alphabet* alpha = seq.getAlphabet();
356  for (size_t i = seq.size(); i > 0; --i)
357  {
358  if (alpha->isGap(seq[i - 1]))
359  seq.deleteElement(i - 1);
360  }
361 }
362 
363 /******************************************************************************/
364 
365 Sequence* SequenceTools::getSequenceWithoutStops(const Sequence& seq, const GeneticCode& gCode) throw (Exception)
366 {
367  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
368  if (!calpha)
369  throw Exception("SequenceTools::getSequenceWithoutStops. Input sequence should have a codon alphabet.");
370  vector<int> content;
371  for (size_t i = 0; i < seq.size(); i++)
372  {
373  if (!gCode.isStop(seq[i]))
374  content.push_back(seq[i]);
375  }
376  Sequence* newSeq = dynamic_cast<Sequence*>(seq.clone());
377  newSeq->setContent(content);
378  return newSeq;
379 }
380 
381 /******************************************************************************/
382 
383 void SequenceTools::removeStops(Sequence& seq, const GeneticCode& gCode) throw (Exception)
384 {
385  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
386  if (!calpha)
387  throw Exception("SequenceTools::removeStops. Input sequence should have a codon alphabet.");
388  for (size_t i = seq.size(); i > 0; --i)
389  {
390  if (gCode.isStop(seq[i - 1]))
391  seq.deleteElement(i - 1);
392  }
393 }
394 
395 /******************************************************************************/
396 
397 void SequenceTools::replaceStopsWithGaps(Sequence& seq, const GeneticCode& gCode) throw (Exception)
398 {
399  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(seq.getAlphabet());
400  if (!calpha)
401  throw Exception("SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");
402  int gap = calpha->getGapCharacterCode();
403  for (size_t i = 0; i < seq.size(); ++i)
404  {
405  if (gCode.isStop(seq[i]))
406  seq.setElement(i, gap);
407  }
408 }
409 
410 /******************************************************************************/
411 
414 {
415  if (seq1.size() != seq2.size())
416  throw SequenceNotAlignedException("SequenceTools::bowkerTest.", &seq2);
417  size_t n = seq1.size();
418  const Alphabet* alpha = seq1.getAlphabet();
419  unsigned int r = alpha->getSize();
420 
421  // Compute contingency table:
422  RowMatrix<double> array(r, r);
423  int x, y;
424  for (size_t i = 0; i < n; i++)
425  {
426  x = seq1[i];
427  y = seq2[i];
428  if (!alpha->isGap(x) && !alpha->isUnresolved(x)
429  && !alpha->isGap(y) && !alpha->isUnresolved(y))
430  {
431  array(static_cast<size_t>(x), static_cast<size_t>(y))++;
432  }
433  }
434 
435  // Compute Bowker's statistic:
436  double sb2 = 0, nij, nji;
437  for (unsigned int i = 1; i < r; ++i)
438  {
439  for (unsigned int j = 0; j < i; ++j)
440  {
441  nij = array(i, j);
442  nji = array(j, i);
443  if (nij != 0 || nji != 0)
444  {
445  sb2 += pow(nij - nji, 2) / (nij + nji);
446  }
447  // Else: we should display a warning there.
448  }
449  }
450 
451  // Compute p-value:
452  double pvalue = 1. - RandomTools::pChisq(sb2, (r - 1) * r / 2);
453 
454  // Return results:
455  BowkerTest* test = new BowkerTest();
456  test->setStatistic(sb2);
457  test->setPValue(pvalue);
458  return test;
459 }
460 
461 /******************************************************************************/
462 
463 void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level)
464 {
465  vector< vector< int > > states(seq.size());
466  list<Sequence*> t_hap;
467  const Alphabet* alpha = seq.getAlphabet();
468  unsigned int hap_count = 1;
469  // Vector of available states at each position
470  for (size_t i = 0; i < seq.size(); i++)
471  {
472  vector<int> st = alpha->getAlias(seq[i]);
473  if (!st.size())
474  {
475  st.push_back(alpha->getGapCharacterCode());
476  }
477  if (st.size() <= level)
478  {
479  states[i] = st;
480  }
481  else
482  {
483  states[i] = vector<int>(1, seq[i]);
484  }
485  }
486  // Combinatorial haplotypes building (the use of tree may be more accurate)
487  t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha));
488  for (size_t i = 0; i < states.size(); i++)
489  {
490  for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++)
491  {
492  for (unsigned int j = 0; j < states[i].size(); j++)
493  {
494  Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha);
495  if (j < states[i].size() - 1)
496  {
497  tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++));
498  tmp_seq->addElement(states[i][j]);
499  t_hap.insert(it, tmp_seq);
500  }
501  else
502  {
503  (**it).addElement(states[i][j]);
504  }
505  }
506  }
507  }
508  for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++)
509  {
510  hap.push_back(*it);
511  }
512 }
513 
514 /******************************************************************************/
515 
517 {
518  if (s1.getAlphabet()->getAlphabetType() != s2.getAlphabet()->getAlphabetType())
519  {
520  throw AlphabetMismatchException("SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2): s1 and s2 don't have same Alphabet.", s1.getAlphabet(), s2.getAlphabet());
521  }
522  const Alphabet* alpha = s1.getAlphabet();
523  vector<int> st;
524  vector<int> seq;
525  size_t length = NumTools::max(s1.size(), s2.size());
526  for (size_t i = 0; i < length; i++)
527  {
528  if (i < s1.size())
529  st.push_back(s1.getValue(i));
530  if (i < s2.size())
531  st.push_back(s2.getValue(i));
532  seq.push_back(alpha->getGeneric(st));
533  st.clear();
534  }
535  Sequence* s = new BasicSequence(s1.getName() + "+" + s2.getName(), seq, alpha);
536  return s;
537 }
538 
539 /******************************************************************************/
540 
541 Sequence* SequenceTools::subtractHaplotype(const Sequence& s, const Sequence& h, string name, unsigned int level) throw (SequenceNotAlignedException)
542 {
543  const Alphabet* alpha = s.getAlphabet();
544  if (name.size() == 0)
545  name = s.getName() + "_haplotype";
546  string seq;
547  if (s.size() != h.size())
548  throw SequenceNotAlignedException("SequenceTools::subtractHaplotype: haplotype must be aligned with the sequence.", &h);
549  for (unsigned int i = 0; i < s.size(); ++i)
550  {
551  string c;
552  vector<int> nucs = alpha->getAlias(s.getValue(i));
553  if (nucs.size() > 1)
554  {
555  remove(nucs.begin(), nucs.end(), h.getValue(i));
556  nucs = vector<int>(nucs.begin(), nucs.end() - 1);
557  }
558  else
559  {
560  nucs = vector<int>(nucs.begin(), nucs.end());
561  }
562  c = alpha->intToChar(alpha->getGeneric(nucs));
563  if (level <= nucs.size() && (alpha->isUnresolved(s.getValue(i)) || alpha->isUnresolved(h.getValue(i))))
564  {
565  c = alpha->intToChar(alpha->getUnknownCharacterCode());
566  }
567  seq += c;
568  }
569  Sequence* hap = new BasicSequence(name, seq, alpha);
570  return hap;
571 }
572 
573 /******************************************************************************/
574 
576 {
577  // Alphabet type checking
578  if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
579  {
580  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
581  }
582 
583  if ((ph < 1) || (ph > 3))
584  throw Exception("Bad phase for RNYSlice: " + TextTools::toString(ph) + ". Should be between 1 and 3.");
585 
586  size_t s = seq.size();
587  size_t n = (s - static_cast<size_t>(ph) + 3) / 3;
588 
589  vector<int> content(n);
590 
591  int tir = seq.getAlphabet()->getGapCharacterCode();
592  size_t j;
593 
594  for (size_t i = 0; i < n; i++)
595  {
596  j = i * 3 + static_cast<size_t>(ph) - 1;
597 
598  if (j == 0)
599  content[i] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
600  else
601  {
602  if (j == s - 1)
603  content[i] = _RNY.getRNY(seq[j - 1], seq[j], tir, *seq.getAlphabet());
604  else
605  content[i] = _RNY.getRNY(seq[j - 1], seq[j], seq[j + 1], *seq.getAlphabet());
606  }
607  }
608 
609  // New sequence creating, and sense reversing
610  Sequence* sq = new BasicSequence(seq.getName(), content,
611  seq.getComments(), &_RNY);
612 
613  // Send result
614  return sq;
615 }
616 
618 {
619  // Alphabet type checking
620  if (seq.getAlphabet()->getAlphabetType() != "DNA alphabet")
621  {
622  throw AlphabetException ("SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
623  }
624 
625  size_t n = seq.size();
626 
627  vector<int> content(n);
628 
629  int tir = seq.getAlphabet()->getGapCharacterCode();
630 
631  if (seq.size() >= 2)
632  {
633  content[0] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
634 
635  for (unsigned int i = 1; i < n - 1; i++)
636  {
637  content[i] = _RNY.getRNY(seq[i - 1], seq[i], seq[i + 1],
638  *seq.getAlphabet());
639  }
640 
641  content[n - 1] = _RNY.getRNY(seq[n - 2], seq[n - 1], tir, *seq.getAlphabet());
642  }
643 
644  // New sequence creating, and sense reversing
645  Sequence* s = new BasicSequence(seq.getName(), content,
646  seq.getComments(), &_RNY);
647 
648  // Send result
649  return s;
650 }
651 
652 /******************************************************************************/
653 
654 
655 void SequenceTools::getCDS(Sequence& sequence, const GeneticCode& gCode, bool checkInit, bool checkStop, bool includeInit, bool includeStop)
656 {
657  const CodonAlphabet* alphabet = dynamic_cast<const CodonAlphabet*>(sequence.getAlphabet());
658  if (!alphabet)
659  throw AlphabetException("SequenceTools::getCDS. Sequence is not a codon sequence.");
660  if (checkInit)
661  {
662  size_t i;
663  for (i = 0; i < sequence.size() && !gCode.isStart(sequence[i]); ++i)
664  {}
665  for (size_t j = 0; includeInit ? j < i : j <= i; ++j)
666  {
667  sequence.deleteElement(j);
668  }
669  }
670  if (checkStop)
671  {
672  size_t i;
673  for (i = 0; i < sequence.size() && !gCode.isStop(sequence[i]); ++i)
674  {}
675  for (size_t j = includeStop ? i + 1 : i; j < sequence.size(); ++j)
676  {
677  sequence.deleteElement(j);
678  }
679  }
680 }
681 
682 /******************************************************************************/
683 
684 size_t SequenceTools::findFirstOf(const Sequence& seq, const Sequence& motif, bool strict)
685 {
686  if (motif.size() > seq.size())
687  return seq.size();
688  for (size_t seqi = 0; seqi < seq.size() - motif.size() + 1; seqi++)
689  {
690  bool match = false;
691  for (size_t moti = 0; moti < motif.size(); moti++)
692  {
693  if (strict)
694  {
695  match = seq.getValue(seqi + moti) == motif.getValue(moti);
696  }
697  else
698  {
699  match = AlphabetTools::match(seq.getAlphabet(), seq.getValue(seqi + moti), motif.getValue(moti));
700  }
701  if (!match)
702  {
703  break;
704  }
705  }
706  if (match)
707  {
708  return seqi;
709  }
710  }
711  return seq.size();
712 }
713 
714 /******************************************************************************/
715 
716 Sequence* SequenceTools::getRandomSequence(const Alphabet* alphabet, size_t length)
717 {
718  int s = static_cast<int>(alphabet->getSize());
719  vector<int> content(length);
720  for (size_t i = 0; i < length; ++i) {
721  content[i] = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(s);
722  }
723  return new BasicSequence("random", content, alphabet);
724 }
725 
726 /******************************************************************************/
727 
static Sequence * getSequenceWithoutStops(const Sequence &seq, const GeneticCode &gCode)
Get a copy of the codon sequence without stops.
static size_t getNumberOfUnresolvedSites(const Sequence &seq)
static bool match(const Alphabet *alphabet, int i, int j)
Tell if two characters match according to a given alphabet.
virtual std::string getName(int state) const =0
Get the complete name of a state given its int description.
static Sequence * concatenate(const Sequence &seq1, const Sequence &seq2)
Concatenate two sequences.
virtual int getGeneric(const std::vector< int > &states) const =0
Get the generic state that match a set of states.
virtual bool isStart(int state) const
Tells is a particular codon is a start codon.
Definition: GeneticCode.h:163
Definition: RNY.h:65
static void getPutativeHaplotypes(const Sequence &seq, std::vector< Sequence *> &hap, unsigned int level=2)
Get all putatives haplotypes from an heterozygous sequence.
virtual bool isGap(int state) const =0
Sequence * clone() const =0
static Sequence * RNYslice(const Sequence &sequence, int ph)
Get the RNY decomposition of a DNA sequence; with a given phase between 1 and 3, it gives the decompo...
This alphabet is used to deal NumericAlphabet.
static Sequence * getInvert(const Sequence &sequence)
Inverse a sequence from 5&#39;->3&#39; to 3&#39;->5&#39; and vice-versa.
virtual bool isUnresolved(int state) const =0
static void removeGaps(Sequence &seq)
Remove gaps from a sequence.
static Sequence * getComplement(const Sequence &sequence)
Get the complementary sequence of a nucleotide sequence.
static size_t getNumberOfSites(const Sequence &seq)
virtual unsigned int getSize() const =0
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet). This is the method you&#39;ll need in most cases.
The Alphabet interface.
Definition: Alphabet.h:130
STL namespace.
static Sequence * getSequenceWithoutGaps(const Sequence &seq)
Get a copy of the sequence without gaps.
virtual std::vector< int > getAlias(int state) const =0
Get all resolved states that match a generic state.
virtual int getGapCharacterCode() const =0
virtual const std::string & getName() const =0
Get the name of this sequence.
static size_t getNumberOfCompleteSites(const Sequence &seq)
int translate(int state) const
Translate a given state coded as a int from source alphabet to target alphabet.
static void getCDS(Sequence &sequence, const GeneticCode &gCode, bool checkInit, bool checkStop, bool includeInit=true, bool includeStop=true)
Extract CDS part from a codon sequence. Optionally check for intiator and stop codons, or both.
virtual void addElement(const std::string &c)=0
Add a character to the end of the list.
virtual void setName(const std::string &name)=0
Set the name of this sequence.
static Sequence * subtractHaplotype(const Sequence &s, const Sequence &h, std::string name="", unsigned int level=1)
Subtract haplotype from an heterozygous sequence.
static double getPercentIdentity(const Sequence &seq1, const Sequence &seq2, bool ignoreGaps=false)
static void removeStops(Sequence &seq, const GeneticCode &gCode)
Remove stops from a codon sequence.
static BowkerTest * bowkerTest(const Sequence &seq1, const Sequence &seq2)
Bowker&#39;s test for homogeneity.
static NucleicAcidsReplication _DNARep
virtual void deleteElement(size_t pos)=0
Delete the element at position &#39;pos&#39;.
Codon alphabet class.
Definition: CodonAlphabet.h:63
static NucleicAcidsReplication _RNARep
static Sequence * getSequenceWithCompleteSites(const Sequence &seq)
keep only complete sites in a sequence.
static Sequence & complement(Sequence &seq)
Complement the nucleotide sequence itself.
virtual int getValue(size_t pos) const =0
Get the element at position &#39;pos&#39; as an int.
void setPValue(double pvalue)
Definition: SequenceTools.h:89
The alphabet exception base class.
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
virtual void setElement(size_t pos, const std::string &c)=0
Set the element at position &#39;pos&#39; to character &#39;c&#39;.
static void replaceStopsWithGaps(Sequence &seq, const GeneticCode &gCode)
Replace stop codons by gaps.
int getGapCharacterCode() const
virtual void setContent(const std::string &sequence)=0
Set the whole content of the sequence.
virtual const Alphabet * getAlphabet() const =0
Get the alphabet associated to the list.
NucleicAcidsReplication SequenceTools::_RNARep & _RNA
static Sequence & invert(Sequence &seq)
Inverse a sequence from 5&#39;->3&#39; to 3&#39;->5&#39; and vice-versa.
static Sequence * transcript(const Sequence &sequence)
Get the transcription sequence of a DNA sequence.
static Sequence & invertComplement(Sequence &seq)
Inverse and complement a sequence.
virtual size_t size() const =0
Get the number of elements in the list.
This alphabet is used to deal with DNA sequences.
Definition: DNA.h:60
virtual bool isStop(int state) const =0
Tells is a particular codon is a stop codon.
The sequence interface.
Definition: Sequence.h:74
static Sequence * combineSequences(const Sequence &s1, const Sequence &s2)
Combine two sequences.
virtual std::string intToChar(int state) const =0
Give the string description of a state given its int description.
static Sequence * subseq(const Sequence &sequence, size_t begin, size_t end)
Get a sub-sequence.
This alphabet is used to deal with RNA sequences.
Definition: RNA.h:58
static Sequence * reverseTranscript(const Sequence &sequence)
Get the reverse-transcription sequence of a RNA sequence.
NucleicAcidsReplication SequenceTools::_DNARep & _DNA
Partial implementation of the Transliterator interface for genetic code object.
Definition: GeneticCode.h:79
Exception thrown when two alphabets do not match.
static NucleicAcidsReplication _transc
Bowker&#39;s homogeneity test results class.
Definition: SequenceTools.h:68
static size_t findFirstOf(const Sequence &seq, const Sequence &motif, bool strict=true)
Find the position of a motif in a sequence.
Replication between to nucleic acids.
virtual int getUnknownCharacterCode() const =0
Exception thrown when a sequence is not align with others.
void setStatistic(double stat)
Definition: SequenceTools.h:88
virtual std::string getAlphabetType() const =0
Identification method.
static Sequence * getRandomSequence(const Alphabet *alphabet, size_t length)
Get a random sequence of given size and alphabet, with all state with equal probability.