bpp-seq  2.2.0
SiteContainerTools.cpp
Go to the documentation of this file.
1 //
2 // File: SiteContainerTools.cpp
3 // Created by: Julien Dutheil
4 // Sylvain Glémin
5 // Created on: Fri Dec 12 18:55:06 2003
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include "SiteContainerTools.h"
42 #include "SequenceContainerTools.h"
43 #include "VectorSiteContainer.h"
44 #include "SiteContainerIterator.h"
45 #include "../SiteTools.h"
46 #include "../CodonSiteTools.h"
47 #include "../Alphabet/AlphabetTools.h"
48 #include "../SequenceTools.h"
49 #include <Bpp/App/ApplicationTools.h>
50 
51 using namespace bpp;
52 
53 // From the STL:
54 #include <vector>
55 #include <deque>
56 #include <string>
57 
58 using namespace std;
59 
60 /******************************************************************************/
61 
63 {
64  vector<string> seqNames = sites.getSequencesNames();
65  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
66  noGapCont->setSequencesNames(seqNames, false);
67  NoGapSiteContainerIterator ngsi(sites);
68  while (ngsi.hasMoreSites())
69  {
70  noGapCont->addSite(*ngsi.nextSite());
71  }
72  return noGapCont;
73 }
74 
75 /******************************************************************************/
76 
78 {
79  vector<string> seqNames = sites.getSequencesNames();
80  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
81  noGapCont->setSequencesNames(seqNames, false);
83  while (csi.hasMoreSites())
84  {
85  noGapCont->addSite(*csi.nextSite());
86  }
87  return noGapCont;
88 }
89 
90 /******************************************************************************/
91 
93  const SiteContainer& sequences,
94  const SiteSelection& selection)
95 {
96  vector<string> seqNames = sequences.getSequencesNames();
97  VectorSiteContainer* sc = new VectorSiteContainer(seqNames.size(), sequences.getAlphabet());
98  sc->setSequencesNames(seqNames, false);
99  for (unsigned int i = 0; i < selection.size(); i++)
100  {
101  sc->addSite(sequences.getSite(selection[i]), false);
102  // We do not check names, we suppose that the container passed as an argument is correct.
103  // WARNING: what if selection contains many times the same indice? ...
104  }
105  sc->setGeneralComments(sequences.getGeneralComments());
106  return sc;
107 }
108 
109 /******************************************************************************/
111  const SiteContainer& sequences,
112  const SiteSelection& selection)
113 {
114  size_t wsize = sequences.getAlphabet()->getStateCodingSize();
115  if (wsize > 1)
116  {
117  if (selection.size() % wsize != 0)
118  throw IOException("SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
119  SiteSelection selection2;
120  for (size_t i = 0; i < selection.size(); i += wsize)
121  {
122  if (selection[i] % wsize != 0)
123  throw IOException("SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
124 
125  for (size_t j = 1; j < wsize; ++j)
126  {
127  if (selection[i + j] != (selection[i + j - 1] + 1))
128  throw IOException("SiteContainerTools::getSelectedPositions: Positions selection is not compatible with the alphabet in use in the container.");
129  }
130  selection2.push_back(selection[i] / wsize);
131  }
132  return getSelectedSites(sequences, selection2);
133  }
134  else
135  {
136  return getSelectedSites(sequences, selection);
137  }
138 }
139 
140 /******************************************************************************/
141 
142 Sequence* SiteContainerTools::getConsensus(const SiteContainer& sc, const std::string& name, bool ignoreGap, bool resolveUnknown)
143 {
144  Vint consensus;
146  const Site* site;
147  while (ssi.hasMoreSites())
148  {
149  site = ssi.nextSite();
150  map<int, double> freq;
151  SiteTools::getFrequencies(*site, freq, resolveUnknown);
152  double max = 0;
153  int cons = -1; // default result
154  if (ignoreGap)
155  {
156  for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
157  {
158  if (it->second > max && it->first != -1)
159  {
160  max = it->second;
161  cons = it->first;
162  }
163  }
164  }
165  else
166  {
167  for (map<int, double>::iterator it = freq.begin(); it != freq.end(); it++)
168  {
169  if (it->second > max)
170  {
171  max = it->second;
172  cons = it->first;
173  }
174  }
175  }
176  consensus.push_back(cons);
177  }
178  Sequence* seqConsensus = new BasicSequence(name, consensus, sc.getAlphabet());
179  return seqConsensus;
180 }
181 
182 /******************************************************************************/
183 
185 {
186  // NB: use iterators for a better algorithm?
187  int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
188  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
189  {
190  for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
191  {
192  int* element = &sites(j, i);
193  if (sites.getAlphabet()->isGap(*element))
194  *element = unknownCode;
195  }
196  }
197 }
198 
199 /******************************************************************************/
200 
202 {
203  // NB: use iterators for a better algorithm?
204  int gapCode = sites.getAlphabet()->getGapCharacterCode();
205  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
206  {
207  for (unsigned int j = 0; j < sites.getNumberOfSequences(); j++)
208  {
209  int* element = &sites(j, i);
210  if (sites.getAlphabet()->isUnresolved(*element))
211  *element = gapCode;
212  }
213  }
214 }
215 
216 /******************************************************************************/
217 
219 {
220  vector<string> seqNames = sites.getSequencesNames();
221  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
222  noGapCont->setSequencesNames(seqNames, false);
223  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
224  {
225  const Site* site = &sites.getSite(i);
226  if (!SiteTools::isGapOnly(*site))
227  noGapCont->addSite(*site);
228  }
229  return noGapCont;
230 }
231 
232 /******************************************************************************/
233 
235 {
236  size_t n = sites.getNumberOfSites();
237  size_t i = n;
238  while (i > 1)
239  {
240  ApplicationTools::displayGauge(n - i + 1, n);
241  const Site* site = &sites.getSite(i - 1);
242  if (SiteTools::isGapOnly(*site))
243  {
244  size_t end = i;
245  while (SiteTools::isGapOnly(*site) && i > 1)
246  {
247  --i;
248  site = &sites.getSite(i - 1);
249  }
250  sites.deleteSites(i, end - i);
251  }
252  else
253  {
254  --i;
255  }
256  }
257  ApplicationTools::displayGauge(n, n);
258  const Site* site = &sites.getSite(0);
259  if (SiteTools::isGapOnly(*site))
260  sites.deleteSite(0);
261 }
262 
263 /******************************************************************************/
264 
266 {
267  vector<string> seqNames = sites.getSequencesNames();
268  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
269  noGapCont->setSequencesNames(seqNames, false);
270  for (unsigned int i = 0; i < sites.getNumberOfSites(); i++)
271  {
272  const Site* site = &sites.getSite(i);
274  noGapCont->addSite(*site, false);
275  }
276  return noGapCont;
277 }
278 
279 /******************************************************************************/
280 
282 {
283  size_t n = sites.getNumberOfSites();
284  size_t i = n;
285  while (i > 1)
286  {
287  ApplicationTools::displayGauge(n - i + 1, n);
288  const Site* site = &sites.getSite(i - 1);
289  if (SiteTools::isGapOnly(*site))
290  {
291  size_t end = i;
292  while (SiteTools::isGapOrUnresolvedOnly(*site) && i > 1)
293  {
294  --i;
295  site = &sites.getSite(i - 1);
296  }
297  sites.deleteSites(i, end - i);
298  }
299  else
300  {
301  --i;
302  }
303  }
304  ApplicationTools::displayGauge(n, n);
305  const Site* site = &sites.getSite(0);
307  sites.deleteSite(0);
308 }
309 
310 /******************************************************************************/
311 
313 {
314  vector<string> seqNames = sites.getSequencesNames();
315  VectorSiteContainer* noGapCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
316  noGapCont->setSequencesNames(seqNames, false);
317  for (unsigned int i = 0; i < sites.getNumberOfSites(); ++i) {
318  map<int, double> freq;
319  SiteTools::getFrequencies(sites.getSite(i), freq);
320  if (freq[-1] <= maxFreqGaps)
321  noGapCont->addSite(sites.getSite(i), false);
322  }
323  return noGapCont;
324 }
325 
326 /******************************************************************************/
327 
328 void SiteContainerTools::removeGapSites(SiteContainer& sites, double maxFreqGaps)
329 {
330  for (size_t i = sites.getNumberOfSites(); i > 0; i--) {
331  map<int, double> freq;
332  SiteTools::getFrequencies(sites.getSite(i - 1), freq);
333  if (freq[-1] > maxFreqGaps)
334  sites.deleteSite(i - 1);
335  }
336 }
337 
338 /******************************************************************************/
339 
341 {
342  const CodonAlphabet* pca = dynamic_cast<const CodonAlphabet*>(sites.getAlphabet());
343  if (!pca)
344  throw AlphabetException("Not a Codon Alphabet", sites.getAlphabet());
345  vector<string> seqNames = sites.getSequencesNames();
346  VectorSiteContainer* noStopCont = new VectorSiteContainer(seqNames.size(), sites.getAlphabet());
347  noStopCont->setSequencesNames(seqNames, false);
348  for (size_t i = 0; i < sites.getNumberOfSites(); i++)
349  {
350  const Site* site = &sites.getSite(i);
351  if (!CodonSiteTools::hasStop(*site, gCode))
352  noStopCont->addSite(*site, false);
353  }
354  return noStopCont;
355 }
356 
357 /******************************************************************************/
358 
360  const SiteContainer& dottedAln,
361  const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception)
362 {
363  if (!AlphabetTools::isDefaultAlphabet(dottedAln.getAlphabet()))
364  throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());
365 
366  // First we look for the reference sequence:
367  size_t n = dottedAln.getNumberOfSequences();
368  if (n == 0)
369  throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");
370 
371  const Sequence* refSeq = 0;
372  for (size_t i = 0; i < n; ++i) // Test each sequence
373  {
374  const Sequence* seq = &dottedAln.getSequence(i);
375  bool isRef = true;
376  for (unsigned int j = 0; isRef && j < seq->size(); ++j) // For each site in the sequence
377  {
378  if (seq->getChar(j) == ".")
379  isRef = false;
380  }
381  if (isRef) // We found the reference sequence!
382  {
383  refSeq = new BasicSequence(*seq);
384  }
385  }
386  if (!refSeq)
387  throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");
388 
389  // Now we build a new VectorSiteContainer:
390  VectorSiteContainer* sites = new VectorSiteContainer(n, resolvedAlphabet);
391 
392  // We add each site one by one:
393  size_t m = dottedAln.getNumberOfSites();
394  string state;
395  for (unsigned int i = 0; i < m; ++i)
396  {
397  string resolved = refSeq->getChar(i);
398  const Site* site = &dottedAln.getSite(i);
399  Site resolvedSite(resolvedAlphabet, site->getPosition());
400  for (unsigned int j = 0; j < n; j++)
401  {
402  state = site->getChar(j);
403  if (state == ".")
404  {
405  state = resolved;
406  }
407  resolvedSite.addElement(state);
408  }
409  // Add the new site:
410  sites->addSite(resolvedSite);
411  }
412 
413  // Seq sequence names:
414  sites->setSequencesNames(dottedAln.getSequencesNames());
415 
416  // Delete the copied sequence:
417  delete refSeq;
418 
419  // Return result:
420  return sites;
421 }
422 
423 /******************************************************************************/
424 
425 std::map<size_t, size_t> SiteContainerTools::getSequencePositions(const Sequence& seq)
426 {
427  map<size_t, size_t> tln;
428  if (seq.size() == 0)
429  return tln;
430  unsigned int count = 0;
431  for (size_t i = 0; i < seq.size(); i++)
432  {
433  if (seq[i] != -1)
434  {
435  count++;
436  tln[i + 1] = count;
437  }
438  }
439  return tln;
440 }
441 
442 /******************************************************************************/
443 
444 std::map<size_t, size_t> SiteContainerTools::getAlignmentPositions(const Sequence& seq)
445 {
446  map<size_t, size_t> tln;
447  if (seq.size() == 0)
448  return tln;
449  unsigned int count = 0;
450  for (size_t i = 0; i < seq.size(); i++)
451  {
452  if (seq[i] != -1)
453  {
454  count++;
455  tln[count] = i + 1;
456  }
457  }
458  return tln;
459 }
460 
461 /******************************************************************************/
462 
463 std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2)
464 throw (AlphabetMismatchException, Exception)
465 {
466  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
467  throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
468  map<size_t, size_t> tln;
469  if (seq1.size() == 0)
470  return tln;
471  unsigned int count1 = 0;
472  unsigned int count2 = 0;
473  if (seq2.size() == 0)
474  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
475  int state1 = seq1[count1];
476  int state2 = seq2[count2];
477  bool end = false;
478  while (!end)
479  {
480  while (state1 == -1)
481  {
482  count1++;
483  if (count1 < seq1.size())
484  state1 = seq1[count1];
485  else
486  break;
487  }
488  while (state2 == -1)
489  {
490  count2++;
491  if (count2 < seq2.size())
492  state2 = seq2[count2];
493  else
494  break;
495  }
496  if (state1 != state2)
497  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
498  tln[count1 + 1] = count2 + 1; // Count start at 1
499  if (count1 == seq1.size() - 1)
500  end = true;
501  else
502  {
503  if (count2 == seq2.size() - 1)
504  {
505  state1 = seq1[++count1];
506  while (state1 == -1)
507  {
508  count1++;
509  if (count1 < seq1.size())
510  state1 = seq1[count1];
511  else
512  break;
513  }
514  if (state1 == -1)
515  end = true;
516  else
517  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
518  }
519  else
520  {
521  state1 = seq1[++count1];
522  state2 = seq2[++count2];
523  }
524  }
525  }
526  return tln;
527 }
528 
529 /******************************************************************************/
530 
531 std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainer& sequences, size_t i1, size_t i2)
532 {
533  const Sequence* seq1 = &sequences.getSequence(i1);
534  const Sequence* seq2 = &sequences.getSequence(i2);
535  map<size_t, size_t> tln;
536  size_t count1 = 0; // Sequence 1 counter
537  size_t count2 = 0; // Sequence 2 counter
538  int state1;
539  int state2;
540  for (size_t i = 0; i < sequences.getNumberOfSites(); i++)
541  {
542  state1 = (*seq1)[i];
543  if (state1 != -1)
544  count1++;
545  state2 = (*seq2)[i];
546  if (state2 != -1)
547  count2++;
548  if (state1 != -1)
549  {
550  tln[count1] = (state2 == -1 ? 0 : count2);
551  }
552  }
553  return tln;
554 }
555 
556 /******************************************************************************/
557 
559  const Sequence& seq1,
560  const Sequence& seq2,
561  const AlphabetIndex2& s,
562  double gap)
564 {
565  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
566  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
567  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
568  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
569  // Check that sequences have no gap!
570  auto_ptr<Sequence> s1(seq1.clone());
572  auto_ptr<Sequence> s2(seq2.clone());
574 
575  // 1) Initialize matrix:
576  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
577  RowMatrix<char> p(s1->size(), s2->size());
578  double choice1, choice2, choice3, mx;
579  char px;
580  for (size_t i = 0; i <= s1->size(); i++)
581  {
582  m(i, 0) = static_cast<double>(i) * gap;
583  }
584  for (size_t j = 0; j <= s2->size(); j++)
585  {
586  m(0, j) = static_cast<double>(j) * gap;
587  }
588  for (size_t i = 1; i <= s1->size(); i++)
589  {
590  for (size_t j = 1; j <= s2->size(); j++)
591  {
592  choice1 = m(i - 1, j - 1) + static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1]));
593  choice2 = m(i - 1, j) + gap;
594  choice3 = m(i, j - 1) + gap;
595  mx = choice1; px = 'd'; // Default in case of equality of scores.
596  if (choice2 > mx)
597  {
598  mx = choice2; px = 'u';
599  }
600  if (choice3 > mx)
601  {
602  mx = choice3; px = 'l';
603  }
604  m(i, j) = mx;
605  p(i - 1, j - 1) = px;
606  }
607  }
608 
609  // 2) Get alignment:
610  deque<int> a1, a2;
611  size_t i = s1->size(), j = s2->size();
612  char c;
613  while (i > 0 && j > 0)
614  {
615  c = p(i - 1, j - 1);
616  if (c == 'd')
617  {
618  a1.push_front((*s1)[i - 1]);
619  a2.push_front((*s2)[j - 1]);
620  i--;
621  j--;
622  }
623  else if (c == 'u')
624  {
625  a1.push_front((*s1)[i - 1]);
626  a2.push_front(-1);
627  i--;
628  }
629  else
630  {
631  a1.push_front(-1);
632  a2.push_front((*s2)[j - 1]);
633  j--;
634  }
635  }
636  while (i > 0)
637  {
638  a1.push_front((*s1)[i - 1]);
639  a2.push_front(-1);
640  i--;
641  }
642  while (j > 0)
643  {
644  a1.push_front(-1);
645  a2.push_front((*s2)[j - 1]);
646  j--;
647  }
648  s1->setContent(vector<int>(a1.begin(), a1.end()));
649  s2->setContent(vector<int>(a2.begin(), a2.end()));
650  AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
651  asc->addSequence(*s1, false);
652  asc->addSequence(*s2, false); // Do not check for sequence names.
653  return asc;
654 }
655 
656 /******************************************************************************/
657 
659  const Sequence& seq1,
660  const Sequence& seq2,
661  const AlphabetIndex2& s,
662  double opening,
663  double extending)
665 {
666  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
667  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
668  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
669  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
670  // Check that sequences have no gap!
671  auto_ptr<Sequence> s1(seq1.clone());
673  auto_ptr<Sequence> s2(seq2.clone());
675 
676  // 1) Initialize matrix:
677  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
678  RowMatrix<double> v(s1->size() + 1, s2->size() + 1);
679  RowMatrix<double> h(s1->size() + 1, s2->size() + 1);
680  RowMatrix<char> p(s1->size(), s2->size());
681 
682  double choice1, choice2, choice3, mx;
683  char px;
684  m(0, 0) = 0.;
685  for (size_t i = 0; i <= s1->size(); i++)
686  {
687  v(i, 0) = log(0.);
688  }
689  for (size_t j = 0; j <= s2->size(); j++)
690  {
691  h(0, j) = log(0.);
692  }
693  for (size_t i = 1; i <= s1->size(); i++)
694  {
695  m(i, 0) = h(i, 0) = opening + static_cast<double>(i) * extending;
696  }
697  for (size_t j = 1; j <= s2->size(); j++)
698  {
699  m(0, j) = v(0, j) = opening + static_cast<double>(j) * extending;
700  }
701 
702  for (size_t i = 1; i <= s1->size(); i++)
703  {
704  for (size_t j = 1; j <= s2->size(); j++)
705  {
706  choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]);
707  choice2 = h(i - 1, j - 1) + opening + extending;
708  choice3 = v(i - 1, j - 1) + opening + extending;
709  mx = choice1; // Default in case of equality of scores.
710  if (choice2 > mx)
711  {
712  mx = choice2;
713  }
714  if (choice3 > mx)
715  {
716  mx = choice3;
717  }
718  m(i, j) = mx;
719 
720  choice1 = m(i, j - 1) + opening + extending;
721  choice2 = h(i, j - 1) + extending;
722  mx = choice1; // Default in case of equality of scores.
723  if (choice2 > mx)
724  {
725  mx = choice2;
726  }
727  h(i, j) = mx;
728 
729  choice1 = m(i - 1, j) + opening + extending;
730  choice2 = v(i - 1, j) + extending;
731  mx = choice1; // Default in case of equality of scores.
732  if (choice2 > mx)
733  {
734  mx = choice2;
735  }
736  v(i, j) = mx;
737 
738  px = 'd';
739  if (v(i, j) > m(i, j))
740  px = 'u';
741  if (h(i, j) > m(i, j))
742  px = 'l';
743  p(i - 1, j - 1) = px;
744  }
745  }
746 
747  // 2) Get alignment:
748  deque<int> a1, a2;
749  size_t i = s1->size(), j = s2->size();
750  char c;
751  while (i > 0 && j > 0)
752  {
753  c = p(i - 1, j - 1);
754  if (c == 'd')
755  {
756  a1.push_front((*s1)[i - 1]);
757  a2.push_front((*s2)[j - 1]);
758  i--;
759  j--;
760  }
761  else if (c == 'u')
762  {
763  a1.push_front((*s1)[i - 1]);
764  a2.push_front(-1);
765  i--;
766  }
767  else
768  {
769  a1.push_front(-1);
770  a2.push_front((*s2)[j - 1]);
771  j--;
772  }
773  }
774  while (i > 0)
775  {
776  a1.push_front((*s1)[i - 1]);
777  a2.push_front(-1);
778  i--;
779  }
780  while (j > 0)
781  {
782  a1.push_front(-1);
783  a2.push_front((*s2)[j - 1]);
784  j--;
785  }
786  s1->setContent(vector<int>(a1.begin(), a1.end()));
787  s2->setContent(vector<int>(a2.begin(), a2.end()));
788  AlignedSequenceContainer* asc = new AlignedSequenceContainer(s1->getAlphabet());
789  asc->addSequence(*s1, false);
790  asc->addSequence(*s2, false); // Do not check for sequence names.
791  return asc;
792 }
793 
794 /******************************************************************************/
795 
796 VectorSiteContainer* SiteContainerTools::sampleSites(const SiteContainer& sites, size_t nbSites, vector<size_t>* index)
797 {
799  for (size_t i = 0; i < nbSites; i++)
800  {
801  size_t pos = static_cast<size_t>(RandomTools::giveIntRandomNumberBetweenZeroAndEntry(static_cast<int>(sites.getNumberOfSites())));
802  sample->addSite(sites.getSite(pos), false);
803  if (index)
804  index->push_back(pos);
805  }
806  return sample;
807 }
808 
809 /******************************************************************************/
810 
812 {
813  return sampleSites(sites, sites.getNumberOfSites());
814 }
815 
816 /******************************************************************************/
817 
818 const string SiteContainerTools::SIMILARITY_ALL = "all sites";
819 const string SiteContainerTools::SIMILARITY_NOFULLGAP = "no full gap";
820 const string SiteContainerTools::SIMILARITY_NODOUBLEGAP = "no double gap";
821 const string SiteContainerTools::SIMILARITY_NOGAP = "no gap";
822 
823 /******************************************************************************/
824 
825 double SiteContainerTools::computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist, const std::string& gapOption, bool unresolvedAsGap) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception)
826 {
827  if (seq1.size() != seq2.size())
828  throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2);
829  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
830  throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet());
831 
832  const Alphabet* alpha = seq1.getAlphabet();
833  unsigned int s = 0;
834  unsigned int t = 0;
835  for (size_t i = 0; i < seq1.size(); i++)
836  {
837  int x = seq1[i];
838  int y = seq2[i];
839  int gapCode = alpha->getGapCharacterCode();
840  if (unresolvedAsGap)
841  {
842  if (alpha->isUnresolved(x))
843  x = gapCode;
844  if (alpha->isUnresolved(y))
845  y = gapCode;
846  }
847  if (gapOption == SIMILARITY_ALL)
848  {
849  t++;
850  if (x == y && !alpha->isGap(x) && !alpha->isGap(y))
851  s++;
852  }
853  else if (gapOption == SIMILARITY_NODOUBLEGAP)
854  {
855  if (!alpha->isGap(x) || !alpha->isGap(y))
856  {
857  t++;
858  if (x == y)
859  s++;
860  }
861  }
862  else if (gapOption == SIMILARITY_NOGAP)
863  {
864  if (!alpha->isGap(x) && !alpha->isGap(y))
865  {
866  t++;
867  if (x == y)
868  s++;
869  }
870  }
871  else
872  throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
873  }
874  double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t));
875  return dist ? 1 - r : r;
876 }
877 
878 /******************************************************************************/
879 
880 DistanceMatrix* SiteContainerTools::computeSimilarityMatrix(const SiteContainer& sites, bool dist, const std::string& gapOption, bool unresolvedAsGap)
881 {
882  size_t n = sites.getNumberOfSequences();
883  DistanceMatrix* mat = new DistanceMatrix(sites.getSequencesNames());
884  string pairwiseGapOption = gapOption;
885  SiteContainer* sites2;
886  if (gapOption == SIMILARITY_NOFULLGAP)
887  {
888  if (unresolvedAsGap)
889  {
890  SiteContainer* tmp = removeGapOrUnresolvedOnlySites(sites);
891  sites2 = new AlignedSequenceContainer(*tmp);
892  delete tmp;
893  }
894  else
895  {
896  SiteContainer* tmp = removeGapOnlySites(sites);
897  sites2 = new AlignedSequenceContainer(*tmp);
898  delete tmp;
899  }
900  pairwiseGapOption = SIMILARITY_ALL;
901  }
902  else
903  {
904  sites2 = new AlignedSequenceContainer(sites);
905  }
906 
907  for (size_t i = 0; i < n; i++)
908  {
909  (*mat)(i, i) = dist ? 0. : 1.;
910  const Sequence* seq1 = &sites2->getSequence(i);
911  for (size_t j = i + 1; j < n; j++)
912  {
913  const Sequence* seq2 = &sites2->getSequence(j);
914  (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(*seq1, *seq2, dist, pairwiseGapOption, unresolvedAsGap);
915  }
916  }
917  delete sites2;
918  return mat;
919 }
920 
921 /******************************************************************************/
922 
923 void SiteContainerTools::merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs)
924 throw (AlphabetMismatchException, Exception)
925 {
926  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
927  throw AlphabetMismatchException("SiteContainerTools::merge.", seqCont1.getAlphabet(), seqCont2.getAlphabet());
928 
929 
930  vector<string> seqNames1 = seqCont1.getSequencesNames();
931  vector<string> seqNames2 = seqCont2.getSequencesNames();
932  const SiteContainer* seqCont2bis = 0;
933  bool del = false;
934  if (seqNames1 == seqNames2)
935  {
936  seqCont2bis = &seqCont2;
937  }
938  else
939  {
940  // We shall reorder sequences first:
941  SiteContainer* seqCont2ter = new VectorSiteContainer(seqCont2.getAlphabet());
942  SequenceContainerTools::getSelectedSequences(seqCont2, seqNames1, *seqCont2ter);
943  seqCont2bis = seqCont2ter;
944  del = true;
945  }
946 
947  if (leavePositionAsIs)
948  {
949  for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
950  {
951  seqCont1.addSite(seqCont2bis->getSite(i), false);
952  }
953  }
954  else
955  {
956  int offset = static_cast<int>(seqCont1.getNumberOfSites());
957  for (size_t i = 0; i < seqCont2bis->getNumberOfSites(); i++)
958  {
959  seqCont1.addSite(seqCont2bis->getSite(i), offset + seqCont2bis->getSite(i).getPosition(), false);
960  }
961  }
962 
963  if (del)
964  delete seqCont2bis;
965 }
966 
967 /******************************************************************************/
968 
969 void SiteContainerTools::getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions)
970 {
971  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
972  int gap = sites.getAlphabet()->getGapCharacterCode();
973  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i) {
974  const Sequence& seq = sites.getSequence(i);
975  unsigned int pos = 0;
976  for (size_t j = 0; j < sites.getNumberOfSites(); ++j) {
977  if (seq[j] != gap) {
978  ++pos;
979  positions(i, j) = pos;
980  } else {
981  positions(i, j) = 0;
982  }
983  }
984  }
985 }
986 
987 /******************************************************************************/
988 
989 vector<int> SiteContainerTools::getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na)
990 {
991  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
992  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
993  vector<int> scores(positions1.getNumberOfColumns());
994  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
995  //Find an anchor point:
996  size_t whichSeq = 0;
997  size_t whichPos = 0;
998  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
999  if (positions1(j, i) > 0) {
1000  whichSeq = j;
1001  whichPos = positions1(j, i);
1002  break;
1003  }
1004  }
1005  if (whichPos == 0) {
1006  //No anchor found, this alignment column is only made of gaps. We assign a score of 'na' and move to the next column.
1007  scores[i] = na;
1008  continue;
1009  }
1010  //We look for the anchor in the reference alignment:
1011  size_t i2 = 0;
1012  bool found = false;
1013  for (size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j) {
1014  if (positions2(whichSeq, j) == whichPos) {
1015  i2 = j;
1016  found = true;
1017  }
1018  }
1019  if (!found) {
1020  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(whichSeq) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
1021  }
1022  //Now we compare all pairs of sequences between the two positions:
1023  bool test = true;
1024  for (size_t j = 0; test && j < positions1.getNumberOfRows(); ++j) {
1025  test = (positions1(j, i) == positions2(j, i2));
1026  }
1027  scores[i] = test ? 1 : 0;
1028  }
1029  return scores;
1030 }
1031 
1032 /******************************************************************************/
1033 
1034 vector<double> SiteContainerTools::getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na)
1035 {
1036  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
1037  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
1038  vector<double> scores(positions1.getNumberOfColumns());
1039  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i) {
1040  //For all positions in alignment 1...
1041  size_t countAlignable = 0;
1042  size_t countAligned = 0;
1043  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j) {
1044  //Get the corresponding column in alignment 2:
1045  size_t whichPos = positions1(j, i);
1046  if (whichPos == 0) {
1047  //No position for this sequence here.
1048  continue;
1049  }
1050  //We look for the position in the second alignment:
1051  size_t i2 = 0;
1052  bool found = false;
1053  for (size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k) {
1054  if (positions2(j, k) == whichPos) {
1055  i2 = k;
1056  found = true;
1057  }
1058  }
1059  if (!found) {
1060  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(j) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
1061  }
1062 
1063  //Now we check all other positions and see if they are aligned with this one:
1064  for (size_t k = j + 1; k < positions1.getNumberOfRows(); ++k) {
1065  size_t whichPos2 = positions1(k, i);
1066  if (whichPos2 == 0) {
1067  //Empty position
1068  continue;
1069  }
1070  countAlignable++;
1071  //check position in alignment 2:
1072  if (positions2(k, i2) == whichPos2)
1073  countAligned++;
1074  }
1075  }
1076  scores[i] = countAlignable == 0 ? na : static_cast<double>(countAligned) / static_cast<double>(countAlignable);
1077  }
1078  return scores;
1079 }
1080 
1081 /******************************************************************************/
1082 
std::vector< size_t > SiteSelection
static SiteContainer * removeGapSites(const SiteContainer &sites, double maxFreqGaps)
Get a siteset with sites with less than a given amount of gaps.
static std::vector< double > getSumOfPairsScores(const Matrix< size_t > &positions1, const Matrix< size_t > &positions2, double na=0)
Compare an alignment to a reference alignment, and compute the sum-of-pairs scores.
The SiteContainer interface.
Definition: SiteContainer.h:63
virtual bool isGap(int state) const =0
virtual int getPosition() const
Get the position of this site.
Definition: Site.h:162
Loop over all sites in a SiteContainer.
Aligned sequences container.
This alphabet is used to deal NumericAlphabet.
static std::map< size_t, size_t > getSequencePositions(const Sequence &seq)
Get the index of each sequence position in an aligned sequence.
virtual std::string getChar(size_t pos) const
Get the element at position &#39;pos&#39; as a character.
Definition: SymbolList.cpp:145
virtual bool isUnresolved(int state) const =0
static void changeGapsToUnknownCharacters(SiteContainer &sites)
Change all gaps to unknown state in a container, according to its alphabet.
static DistanceMatrix * computeSimilarityMatrix(const SiteContainer &sites, bool dist=false, const std::string &gapOption=SIMILARITY_NOFULLGAP, bool unresolvedAsGap=true)
Compute the similarity matrix of an alignment.
static void removeGaps(Sequence &seq)
Remove gaps from a sequence.
Loop over all complete sites in a SiteContainer (i.e. sites without gap and unresolved characters)...
The Alphabet interface.
Definition: Alphabet.h:130
static VectorSiteContainer * bootstrapSites(const SiteContainer &sites)
Bootstrap sites in an alignment.
STL namespace.
virtual unsigned int getStateCodingSize() const =0
Get the size of the string coding a state.
static std::vector< int > getColumnScores(const Matrix< size_t > &positions1, const Matrix< size_t > &positions2, int na=0)
Compare an alignment to a reference alignment, and compute the column scores.
static const std::string SIMILARITY_ALL
static void getFrequencies(const SymbolList &list, std::map< int, double > &frequencies, bool resolveUnknowns=false)
Get all states frequencies in the list.
static SiteContainer * getCompleteSites(const SiteContainer &sites)
Retrieves complete sites from SiteContainer.
virtual int getGapCharacterCode() const =0
static SiteContainer * getSelectedPositions(const SiteContainer &sequences, const SiteSelection &selection)
Create a new container with a specified set of positions.
static bool isGapOrUnresolvedOnly(const Site &site)
Definition: SiteTools.cpp:82
void setSequencesNames(const std::vector< std::string > &names, bool checkNames=true)
Set all sequence names.
static SiteContainer * removeStopCodonSites(const SiteContainer &sites, const GeneticCode &gCode)
Get a site set without stop codons, if the alphabet is a CodonAlphabet, otherwise throws an Exception...
static AlignedSequenceContainer * alignNW(const Sequence &seq1, const Sequence &seq2, const AlphabetIndex2 &s, double gap)
Align two sequences using the Needleman-Wunsch dynamic algorithm.
static SiteContainer * getSelectedSites(const SiteContainer &sequences, const SiteSelection &selection)
Create a new container with a specified set of sites.
static Sequence * getConsensus(const SiteContainer &sc, const std::string &name="consensus", bool ignoreGap=true, bool resolveUnknown=false)
create the consensus sequence of the alignment.
virtual const Sequence & getSequence(size_t sequenceIndex) const =0
Retrieve a sequence object from the container.
static SiteContainer * removeGapOrUnresolvedOnlySites(const SiteContainer &sites)
Get a site set without gap/unresolved-only sites.
Codon alphabet class.
Definition: CodonAlphabet.h:63
virtual const Comments & getGeneralComments() const =0
Get the comments of this container.
void addSite(const Site &site, bool checkPosition=true)
Add a site in the container.
static VectorSiteContainer * sampleSites(const SiteContainer &sites, size_t nbSites, std::vector< size_t > *index=0)
Sample sites in an alignment.
static double computeSimilarity(const Sequence &seq1, const Sequence &seq2, bool dist=false, const std::string &gapOption=SIMILARITY_NODOUBLEGAP, bool unresolvedAsGap=true)
Compute the similarity/distance score between two aligned sequences.
static const std::string SIMILARITY_NOGAP
virtual std::string getChar(size_t pos) const =0
Get the element at position &#39;pos&#39; as a character.
static void getSelectedSequences(const OrderedSequenceContainer &sequences, const SequenceSelection &selection, SequenceContainer &outputCont)
Add a specified set of sequences from a container to another.
static void changeUnresolvedCharactersToGaps(SiteContainer &sites)
Change all unresolved characters to gaps in a container, according to its alphabet.
static bool hasStop(const Site &site, const GeneticCode &gCode)
Method to know if a codon site contains stop codon or not.
Loop over all sites without gaps in a SiteContainer.
virtual void addSite(const Site &site, bool checkPosition)=0
Add a site in the container.
void addSequence(const Sequence &sequence, bool checkName=true)
Add a sequence at the end of the container.
virtual const Site & getSite(size_t siteIndex) const =0
Get a site from the container.
The alphabet exception base class.
static std::map< size_t, size_t > getAlignmentPositions(const Sequence &seq)
Get the index of each alignment position in an aligned sequence.
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
Two dimensionnal alphabet index interface.
virtual size_t getNumberOfSites() const =0
Get the number of sites in the container.
virtual size_t size() const =0
Get the number of elements in the list.
A Matrix class to store phylogenetic distances.
static std::map< size_t, size_t > translateAlignment(const Sequence &seq1, const Sequence &seq2)
Translate alignement positions from an aligned sequence to the same sequence in a different alignment...
static SiteContainer * removeGapOnlySites(const SiteContainer &sites)
Get a site set without gap-only sites.
The sequence interface.
Definition: Sequence.h:74
static const std::string SIMILARITY_NOFULLGAP
static bool isGapOnly(const Site &site)
Definition: SiteTools.cpp:69
static void merge(SiteContainer &seqCont1, const SiteContainer &seqCont2, bool leavePositionAsIs=false)
Add the content of a site container to an exhisting one.
static const std::string SIMILARITY_NODOUBLEGAP
static SiteContainer * getSitesWithoutGaps(const SiteContainer &sites)
Retrieves sites without gaps from SiteContainer.
virtual const Alphabet * getAlphabet() const =0
Get sequence container&#39;s alphabet.
The Site class.
Definition: Site.h:61
Exception thrown when two alphabets do not match.
Partial implementation of the Transliterator interface for genetic code object.
Definition: GeneticCode.h:79
static bool isDefaultAlphabet(const Alphabet *alphabet)
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
static SiteContainer * resolveDottedAlignment(const SiteContainer &dottedAln, const Alphabet *resolvedAlphabet)
Resolve a container with "." notations.
virtual void deleteSites(size_t siteIndex, size_t length)=0
Delete a continuous range of sites in the container.
virtual int getUnknownCharacterCode() const =0
Exception thrown when a sequence is not align with others.
static std::map< size_t, size_t > translateSequence(const SiteContainer &sequences, size_t i1, size_t i2)
Translate sequence positions from a sequence to another in the same alignment.
void setGeneralComments(const Comments &comments)
Set the comments of this container.
The VectorSiteContainer class.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual void deleteSite(size_t siteIndex)=0
Delete a site in the container.