46 #include <Bpp/Numeric/Matrix/Matrix.h> 47 #include <Bpp/Numeric/VectorTools.h> 71 if (end >= sequence.size())
72 throw IndexOutOfBoundsException (
"SequenceTools::subseq : Invalid upper bound", end, 0, sequence.size());
74 throw Exception (
"SequenceTools::subseq : Invalid interval");
77 vector<int> temp(sequence.getContent());
80 temp.erase(temp.begin() +
static_cast<ptrdiff_t
>(end + 1), temp.end());
81 temp.erase(temp.begin(), temp.begin() +
static_cast<ptrdiff_t
>(begin));
84 return new BasicSequence(sequence.getName(), temp, sequence.getComments(), sequence.getAlphabet());
92 if ((seq1.getAlphabet()->getAlphabetType()) != (seq2.getAlphabet()->getAlphabetType()))
93 throw AlphabetMismatchException(
"SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet());
96 if (seq1.getName() != seq2.getName())
97 throw Exception (
"SequenceTools::concatenate : Sequence's names don't match");
100 vector<int> sequence = seq1.getContent();
101 vector<int> sequence2 = seq2.getContent();
102 sequence.insert(sequence.end(), sequence2.begin(), sequence2.end());
103 return new BasicSequence(seq1.getName(), sequence, seq1.getComments(), seq1.getAlphabet());
112 if (seq.getAlphabet()->getAlphabetType() ==
"DNA alphabet")
116 else if (seq.getAlphabet()->getAlphabetType() ==
"RNA alphabet")
122 throw AlphabetException(
"SequenceTools::complement: Sequence must be nucleic.", seq.getAlphabet());
124 for (
size_t i = 0; i < seq.size(); i++)
126 seq.setElement(i, NAR->
translate(seq.getValue(i)));
137 if (sequence.getAlphabet()->getAlphabetType() ==
"DNA alphabet")
141 else if (sequence.getAlphabet()->getAlphabetType() ==
"RNA alphabet")
147 throw AlphabetException (
"SequenceTools::getComplement: Sequence must be nucleic.", sequence.getAlphabet());
158 if (sequence.getAlphabet()->getAlphabetType() !=
"DNA alphabet")
160 throw AlphabetException (
"SequenceTools::transcript : Sequence must be DNA", sequence.getAlphabet());
163 return _transc.translate(sequence);
171 if (sequence.getAlphabet()->getAlphabetType() !=
"RNA alphabet")
173 throw AlphabetException (
"SequenceTools::reverseTranscript : Sequence must be RNA", sequence.getAlphabet());
176 return _transc.reverse(sequence);
183 size_t seq_size = seq.
size();
186 for (
size_t i = 0; i < seq_size / 2; i++)
188 j = seq_size - 1 - i;
226 size_t seq_size = seq.
size();
229 for (
size_t i = 0; i < seq_size / 2; i++)
231 j = seq_size - 1 - i;
247 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
249 if (seq1.size() != seq2.size())
251 int gap = seq1.getAlphabet()->getGapCharacterCode();
254 for (
size_t i = 0; i < seq1.size(); i++)
256 int x = seq1.getValue(i);
257 int y = seq2.getValue(i);
260 if (x != gap && y != gap)
274 return static_cast<double>(id) / static_cast<double>(tot) * 100.;
283 for (
size_t i = 0; i < seq.
size(); i++)
285 if (!alpha->
isGap(seq[i]))
297 for (
size_t i = 0; i < seq.
size(); i++)
311 for (
size_t i = 0; i < seq.
size(); i++)
314 content.push_back(seq[i]);
327 for (
size_t i = 0; i < seq.
size(); i++)
341 for (
size_t i = 0; i < seq.
size(); i++)
343 if (!alpha->
isGap(seq[i]))
344 content.push_back(seq[i]);
356 for (
size_t i = seq.
size(); i > 0; --i)
358 if (alpha->
isGap(seq[i - 1]))
369 throw Exception(
"SequenceTools::getSequenceWithoutStops. Input sequence should have a codon alphabet.");
371 for (
size_t i = 0; i < seq.size(); i++)
373 if (!gCode.isStop(seq[i]))
374 content.push_back(seq[i]);
387 throw Exception(
"SequenceTools::removeStops. Input sequence should have a codon alphabet.");
388 for (
size_t i = seq.size(); i > 0; --i)
390 if (gCode.isStop(seq[i - 1]))
391 seq.deleteElement(i - 1);
401 throw Exception(
"SequenceTools::replaceStopsWithGaps. Input sequence should have a codon alphabet.");
403 for (
size_t i = 0; i < seq.size(); ++i)
405 if (gCode.isStop(seq[i]))
406 seq.setElement(i, gap);
415 if (seq1.size() != seq2.size())
417 size_t n = seq1.size();
418 const Alphabet* alpha = seq1.getAlphabet();
419 unsigned int r = alpha->
getSize();
422 RowMatrix<double> array(r, r);
424 for (
size_t i = 0; i < n; i++)
431 array(static_cast<size_t>(x), static_cast<size_t>(y))++;
436 double sb2 = 0, nij, nji;
437 for (
unsigned int i = 1; i < r; ++i)
439 for (
unsigned int j = 0; j < i; ++j)
443 if (nij != 0 || nji != 0)
445 sb2 += pow(nij - nji, 2) / (nij + nji);
452 double pvalue = 1. - RandomTools::pChisq(sb2, (r - 1) * r / 2);
465 vector< vector< int > > states(seq.
size());
466 list<Sequence*> t_hap;
468 unsigned int hap_count = 1;
470 for (
size_t i = 0; i < seq.
size(); i++)
472 vector<int> st = alpha->
getAlias(seq[i]);
477 if (st.size() <= level)
483 states[i] = vector<int>(1, seq[i]);
487 t_hap.push_back(
new BasicSequence(seq.
getName() +
"_hap" + TextTools::toString(hap_count++),
"", alpha));
488 for (
size_t i = 0; i < states.size(); i++)
490 for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++)
492 for (
unsigned int j = 0; j < states[i].size(); j++)
495 if (j < states[i].size() - 1)
497 tmp_seq->
setName(tmp_seq->
getName() + TextTools::toString(hap_count++));
499 t_hap.insert(it, tmp_seq);
503 (**it).addElement(states[i][j]);
508 for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++)
518 if (s1.getAlphabet()->getAlphabetType() != s2.getAlphabet()->getAlphabetType())
520 throw AlphabetMismatchException(
"SequenceTools::combineSequences(const Sequence& s1, const Sequence& s2): s1 and s2 don't have same Alphabet.", s1.getAlphabet(), s2.getAlphabet());
522 const Alphabet* alpha = s1.getAlphabet();
525 size_t length = NumTools::max(s1.size(), s2.size());
526 for (
size_t i = 0; i < length; i++)
529 st.push_back(s1.getValue(i));
531 st.push_back(s2.getValue(i));
543 const Alphabet* alpha = s.getAlphabet();
544 if (name.size() == 0)
545 name = s.
getName() +
"_haplotype";
547 if (s.size() != h.size())
549 for (
unsigned int i = 0; i < s.size(); ++i)
552 vector<int> nucs = alpha->
getAlias(s.getValue(i));
555 remove(nucs.begin(), nucs.end(), h.getValue(i));
556 nucs = vector<int>(nucs.begin(), nucs.end() - 1);
560 nucs = vector<int>(nucs.begin(), nucs.end());
578 if (seq.getAlphabet()->getAlphabetType() !=
"DNA alphabet")
580 throw AlphabetException (
"SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
583 if ((ph < 1) || (ph > 3))
584 throw Exception(
"Bad phase for RNYSlice: " + TextTools::toString(ph) +
". Should be between 1 and 3.");
586 size_t s = seq.size();
587 size_t n = (s -
static_cast<size_t>(ph) + 3) / 3;
589 vector<int> content(n);
591 int tir = seq.getAlphabet()->getGapCharacterCode();
594 for (
size_t i = 0; i < n; i++)
596 j = i * 3 +
static_cast<size_t>(ph) - 1;
599 content[i] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
603 content[i] = _RNY.getRNY(seq[j - 1], seq[j], tir, *seq.getAlphabet());
605 content[i] = _RNY.getRNY(seq[j - 1], seq[j], seq[j + 1], *seq.getAlphabet());
611 seq.getComments(), &_RNY);
620 if (seq.getAlphabet()->getAlphabetType() !=
"DNA alphabet")
622 throw AlphabetException (
"SequenceTools::transcript : Sequence must be DNA", seq.getAlphabet());
625 size_t n = seq.size();
627 vector<int> content(n);
629 int tir = seq.getAlphabet()->getGapCharacterCode();
633 content[0] = _RNY.getRNY(tir, seq[0], seq[1], *seq.getAlphabet());
635 for (
unsigned int i = 1; i < n - 1; i++)
637 content[i] = _RNY.getRNY(seq[i - 1], seq[i], seq[i + 1],
641 content[n - 1] = _RNY.getRNY(seq[n - 2], seq[n - 1], tir, *seq.getAlphabet());
646 seq.getComments(), &_RNY);
659 throw AlphabetException(
"SequenceTools::getCDS. Sequence is not a codon sequence.");
663 for (i = 0; i < sequence.
size() && !gCode.
isStart(sequence[i]); ++i)
665 for (
size_t j = 0; includeInit ? j < i : j <= i; ++j)
673 for (i = 0; i < sequence.
size() && !gCode.
isStop(sequence[i]); ++i)
675 for (
size_t j = includeStop ? i + 1 : i; j < sequence.
size(); ++j)
688 for (
size_t seqi = 0; seqi < seq.
size() - motif.
size() + 1; seqi++)
691 for (
size_t moti = 0; moti < motif.
size(); moti++)
718 int s =
static_cast<int>(alphabet->
getSize());
719 vector<int> content(length);
720 for (
size_t i = 0; i < length; ++i) {
721 content[i] = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(s);
virtual std::string getName(int state) const =0
Get the complete name of a state given its int description.
virtual int getGeneric(const std::vector< int > &states) const =0
Get the generic state that match a set of states.
virtual bool isStart(int state) const
Tells is a particular codon is a start codon.
virtual bool isGap(int state) const =0
Sequence * clone() const =0
This alphabet is used to deal NumericAlphabet.
virtual bool isUnresolved(int state) const =0
virtual unsigned int getSize() const =0
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet). This is the method you'll need in most cases.
virtual std::vector< int > getAlias(int state) const =0
Get all resolved states that match a generic state.
virtual int getGapCharacterCode() const =0
virtual const std::string & getName() const =0
Get the name of this sequence.
int translate(int state) const
Translate a given state coded as a int from source alphabet to target alphabet.
virtual void addElement(const std::string &c)=0
Add a character to the end of the list.
virtual void setName(const std::string &name)=0
Set the name of this sequence.
virtual void deleteElement(size_t pos)=0
Delete the element at position 'pos'.
virtual int getValue(size_t pos) const =0
Get the element at position 'pos' as an int.
void setPValue(double pvalue)
The alphabet exception base class.
A basic implementation of the Sequence interface.
virtual void setElement(size_t pos, const std::string &c)=0
Set the element at position 'pos' to character 'c'.
int getGapCharacterCode() const
virtual void setContent(const std::string &sequence)=0
Set the whole content of the sequence.
virtual const Alphabet * getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
This alphabet is used to deal with DNA sequences.
virtual bool isStop(int state) const =0
Tells is a particular codon is a stop codon.
virtual std::string intToChar(int state) const =0
Give the string description of a state given its int description.
This alphabet is used to deal with RNA sequences.
Partial implementation of the Transliterator interface for genetic code object.
Exception thrown when two alphabets do not match.
Bowker's homogeneity test results class.
Replication between to nucleic acids.
virtual int getUnknownCharacterCode() const =0
Exception thrown when a sequence is not align with others.
void setStatistic(double stat)
virtual std::string getAlphabetType() const =0
Identification method.