45 #include "../SiteTools.h" 46 #include "../CodonSiteTools.h" 47 #include "../Alphabet/AlphabetTools.h" 48 #include "../SequenceTools.h" 49 #include <Bpp/App/ApplicationTools.h> 99 for (
unsigned int i = 0; i < selection.size(); i++)
117 if (selection.size() % wsize != 0)
118 throw IOException(
"SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
120 for (
size_t i = 0; i < selection.size(); i += wsize)
122 if (selection[i] % wsize != 0)
123 throw IOException(
"SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
125 for (
size_t j = 1; j < wsize; ++j)
127 if (selection[i + j] != (selection[i + j - 1] + 1))
128 throw IOException(
"SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
130 selection2.push_back(selection[i] / wsize);
132 return getSelectedSites(sequences, selection2);
136 return getSelectedSites(sequences, selection);
150 map<int, double> freq;
156 for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
158 if (it->second > max && it->first != -1)
167 for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
169 if (it->second > max)
176 consensus.push_back(cons);
192 int* element = &sites(j, i);
194 *element = unknownCode;
209 int* element = &sites(j, i);
240 ApplicationTools::displayGauge(n - i + 1, n);
257 ApplicationTools::displayGauge(n, n);
274 noGapCont->
addSite(*site,
false);
287 ApplicationTools::displayGauge(n - i + 1, n);
304 ApplicationTools::displayGauge(n, n);
318 map<int, double> freq;
320 if (freq[-1] <= maxFreqGaps)
331 map<int, double> freq;
333 if (freq[-1] > maxFreqGaps)
345 vector<string> seqNames = sites.getSequencesNames();
348 for (
size_t i = 0; i < sites.getNumberOfSites(); i++)
350 const Site* site = &sites.getSite(i);
352 noStopCont->
addSite(*site,
false);
364 throw AlphabetException(
"SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());
367 size_t n = dottedAln.getNumberOfSequences();
369 throw Exception(
"SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");
372 for (
size_t i = 0; i < n; ++i)
374 const Sequence* seq = &dottedAln.getSequence(i);
376 for (
unsigned int j = 0; isRef && j < seq->
size(); ++j)
387 throw Exception(
"SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");
393 size_t m = dottedAln.getNumberOfSites();
395 for (
unsigned int i = 0; i < m; ++i)
397 string resolved = refSeq->
getChar(i);
398 const Site* site = &dottedAln.getSite(i);
400 for (
unsigned int j = 0; j < n; j++)
407 resolvedSite.addElement(state);
427 map<size_t, size_t> tln;
430 unsigned int count = 0;
431 for (
size_t i = 0; i < seq.
size(); i++)
446 map<size_t, size_t> tln;
449 unsigned int count = 0;
450 for (
size_t i = 0; i < seq.
size(); i++)
466 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
468 map<size_t, size_t> tln;
469 if (seq1.size() == 0)
471 unsigned int count1 = 0;
472 unsigned int count2 = 0;
473 if (seq2.size() == 0)
474 throw Exception(
"SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) +
" and " + TextTools::toString(count2 + 1) +
".");
475 int state1 = seq1[count1];
476 int state2 = seq2[count2];
483 if (count1 < seq1.size())
484 state1 = seq1[count1];
491 if (count2 < seq2.size())
492 state2 = seq2[count2];
496 if (state1 != state2)
497 throw Exception(
"SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) +
" and " + TextTools::toString(count2 + 1) +
".");
498 tln[count1 + 1] = count2 + 1;
499 if (count1 == seq1.size() - 1)
503 if (count2 == seq2.size() - 1)
505 state1 = seq1[++count1];
509 if (count1 < seq1.size())
510 state1 = seq1[count1];
517 throw Exception(
"SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) +
" and " + TextTools::toString(count2 + 1) +
".");
521 state1 = seq1[++count1];
522 state2 = seq2[++count2];
535 map<size_t, size_t> tln;
550 tln[count1] = (state2 == -1 ? 0 : count2);
565 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
567 if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
570 auto_ptr<Sequence> s1(seq1.clone());
572 auto_ptr<Sequence> s2(seq2.clone());
576 RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
577 RowMatrix<char> p(s1->size(), s2->size());
578 double choice1, choice2, choice3, mx;
580 for (
size_t i = 0; i <= s1->size(); i++)
582 m(i, 0) =
static_cast<double>(i) * gap;
584 for (
size_t j = 0; j <= s2->size(); j++)
586 m(0, j) =
static_cast<double>(j) * gap;
588 for (
size_t i = 1; i <= s1->size(); i++)
590 for (
size_t j = 1; j <= s2->size(); j++)
592 choice1 = m(i - 1, j - 1) +
static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1]));
593 choice2 = m(i - 1, j) + gap;
594 choice3 = m(i, j - 1) + gap;
595 mx = choice1; px =
'd';
598 mx = choice2; px =
'u';
602 mx = choice3; px =
'l';
605 p(i - 1, j - 1) = px;
611 size_t i = s1->size(), j = s2->size();
613 while (i > 0 && j > 0)
618 a1.push_front((*s1)[i - 1]);
619 a2.push_front((*s2)[j - 1]);
625 a1.push_front((*s1)[i - 1]);
632 a2.push_front((*s2)[j - 1]);
638 a1.push_front((*s1)[i - 1]);
645 a2.push_front((*s2)[j - 1]);
648 s1->setContent(vector<int>(a1.begin(), a1.end()));
649 s2->setContent(vector<int>(a2.begin(), a2.end()));
666 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
668 if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
671 auto_ptr<Sequence> s1(seq1.clone());
673 auto_ptr<Sequence> s2(seq2.clone());
677 RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
678 RowMatrix<double> v(s1->size() + 1, s2->size() + 1);
679 RowMatrix<double> h(s1->size() + 1, s2->size() + 1);
680 RowMatrix<char> p(s1->size(), s2->size());
682 double choice1, choice2, choice3, mx;
685 for (
size_t i = 0; i <= s1->size(); i++)
689 for (
size_t j = 0; j <= s2->size(); j++)
693 for (
size_t i = 1; i <= s1->size(); i++)
695 m(i, 0) = h(i, 0) = opening +
static_cast<double>(i) * extending;
697 for (
size_t j = 1; j <= s2->size(); j++)
699 m(0, j) = v(0, j) = opening +
static_cast<double>(j) * extending;
702 for (
size_t i = 1; i <= s1->size(); i++)
704 for (
size_t j = 1; j <= s2->size(); j++)
706 choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]);
707 choice2 = h(i - 1, j - 1) + opening + extending;
708 choice3 = v(i - 1, j - 1) + opening + extending;
720 choice1 = m(i, j - 1) + opening + extending;
721 choice2 = h(i, j - 1) + extending;
729 choice1 = m(i - 1, j) + opening + extending;
730 choice2 = v(i - 1, j) + extending;
739 if (v(i, j) > m(i, j))
741 if (h(i, j) > m(i, j))
743 p(i - 1, j - 1) = px;
749 size_t i = s1->size(), j = s2->size();
751 while (i > 0 && j > 0)
756 a1.push_front((*s1)[i - 1]);
757 a2.push_front((*s2)[j - 1]);
763 a1.push_front((*s1)[i - 1]);
770 a2.push_front((*s2)[j - 1]);
776 a1.push_front((*s1)[i - 1]);
783 a2.push_front((*s2)[j - 1]);
786 s1->setContent(vector<int>(a1.begin(), a1.end()));
787 s2->setContent(vector<int>(a2.begin(), a2.end()));
799 for (
size_t i = 0; i < nbSites; i++)
801 size_t pos =
static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.
getNumberOfSites())));
804 index->push_back(pos);
827 if (seq1.size() != seq2.size())
829 if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
832 const Alphabet* alpha = seq1.getAlphabet();
835 for (
size_t i = 0; i < seq1.size(); i++)
847 if (gapOption == SIMILARITY_ALL)
850 if (x == y && !alpha->
isGap(x) && !alpha->
isGap(y))
853 else if (gapOption == SIMILARITY_NODOUBLEGAP)
862 else if (gapOption == SIMILARITY_NOGAP)
872 throw Exception(
"SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
874 double r = (t == 0 ? 0. :
static_cast<double>(s) / static_cast<double>(t));
875 return dist ? 1 - r : r;
884 string pairwiseGapOption = gapOption;
886 if (gapOption == SIMILARITY_NOFULLGAP)
900 pairwiseGapOption = SIMILARITY_ALL;
907 for (
size_t i = 0; i < n; i++)
909 (*mat)(i, i) = dist ? 0. : 1.;
911 for (
size_t j = i + 1; j < n; j++)
914 (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(*seq1, *seq2, dist, pairwiseGapOption, unresolvedAsGap);
926 if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
930 vector<string> seqNames1 = seqCont1.getSequencesNames();
931 vector<string> seqNames2 = seqCont2.getSequencesNames();
934 if (seqNames1 == seqNames2)
936 seqCont2bis = &seqCont2;
943 seqCont2bis = seqCont2ter;
947 if (leavePositionAsIs)
956 int offset =
static_cast<int>(seqCont1.getNumberOfSites());
975 unsigned int pos = 0;
979 positions(i, j) = pos;
991 if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
992 throw Exception(
"SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
993 vector<int> scores(positions1.getNumberOfColumns());
994 for (
size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
998 for (
size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
999 if (positions1(j, i) > 0) {
1001 whichPos = positions1(j, i);
1005 if (whichPos == 0) {
1013 for (
size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j) {
1014 if (positions2(whichSeq, j) == whichPos) {
1020 throw Exception(
"SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) +
" of sequence " + TextTools::toString(whichSeq) +
" not found in reference alignment. Please make sure the two indexes are built from the same data!");
1024 for (
size_t j = 0; test && j < positions1.getNumberOfRows(); ++j) {
1025 test = (positions1(j, i) == positions2(j, i2));
1027 scores[i] = test ? 1 : 0;
1036 if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
1037 throw Exception(
"SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
1038 vector<double> scores(positions1.getNumberOfColumns());
1039 for (
size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
1041 size_t countAlignable = 0;
1042 size_t countAligned = 0;
1043 for (
size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
1045 size_t whichPos = positions1(j, i);
1046 if (whichPos == 0) {
1053 for (
size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k) {
1054 if (positions2(j, k) == whichPos) {
1060 throw Exception(
"SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) +
" of sequence " + TextTools::toString(j) +
" not found in reference alignment. Please make sure the two indexes are built from the same data!");
1064 for (
size_t k = j + 1; k < positions1.getNumberOfRows(); ++k) {
1065 size_t whichPos2 = positions1(k, i);
1066 if (whichPos2 == 0) {
1072 if (positions2(k, i2) == whichPos2)
1076 scores[i] = countAlignable == 0 ? na :
static_cast<double>(countAligned) / static_cast<double>(countAlignable);
std::vector< size_t > SiteSelection
The SiteContainer interface.
virtual bool isGap(int state) const =0
virtual int getPosition() const
Get the position of this site.
Loop over all sites in a SiteContainer.
Aligned sequences container.
This alphabet is used to deal NumericAlphabet.
virtual std::string getChar(size_t pos) const
Get the element at position 'pos' as a character.
virtual bool isUnresolved(int state) const =0
Loop over all complete sites in a SiteContainer (i.e. sites without gap and unresolved characters)...
virtual unsigned int getStateCodingSize() const =0
Get the size of the string coding a state.
virtual int getGapCharacterCode() const =0
void setSequencesNames(const std::vector< std::string > &names, bool checkNames=true)
Set all sequence names.
virtual const Sequence & getSequence(size_t sequenceIndex) const =0
Retrieve a sequence object from the container.
virtual const Comments & getGeneralComments() const =0
Get the comments of this container.
void addSite(const Site &site, bool checkPosition=true)
Add a site in the container.
bool hasMoreSites() const
virtual std::string getChar(size_t pos) const =0
Get the element at position 'pos' as a character.
Loop over all sites without gaps in a SiteContainer.
virtual void addSite(const Site &site, bool checkPosition)=0
Add a site in the container.
void addSequence(const Sequence &sequence, bool checkName=true)
Add a sequence at the end of the container.
virtual const Site & getSite(size_t siteIndex) const =0
Get a site from the container.
The alphabet exception base class.
A basic implementation of the Sequence interface.
bool hasMoreSites() const
Two dimensionnal alphabet index interface.
bool hasMoreSites() const
virtual size_t getNumberOfSites() const =0
Get the number of sites in the container.
virtual size_t size() const =0
Get the number of elements in the list.
A Matrix class to store phylogenetic distances.
virtual const Alphabet * getAlphabet() const =0
Get sequence container's alphabet.
Exception thrown when two alphabets do not match.
Partial implementation of the Transliterator interface for genetic code object.
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
virtual void deleteSites(size_t siteIndex, size_t length)=0
Delete a continuous range of sites in the container.
virtual int getUnknownCharacterCode() const =0
Exception thrown when a sequence is not align with others.
void setGeneralComments(const Comments &comments)
Set the comments of this container.
The VectorSiteContainer class.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual void deleteSite(size_t siteIndex)=0
Delete a site in the container.