bpp-popgen  2.2.0
PolymorphismSequenceContainerTools.cpp
Go to the documentation of this file.
1 //
2 // File: PolymorphismSequenceContainerTools.cpp
3 // Authors: Eric Bazin
4 // Sylvain Gaillard
5 // Created on: Thursday July 29 2004
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for population genetics analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
42 
43 using namespace bpp;
44 using namespace std;
45 
47 
48 /******************************************************************************/
49 
50 PolymorphismSequenceContainer* PolymorphismSequenceContainerTools::read(const std::string& path, const Alphabet* alpha) throw (Exception)
51 {
52  Mase ms;
53  string key;
54  const OrderedSequenceContainer* seqc = 0;
55  try
56  {
57  seqc = dynamic_cast<OrderedSequenceContainer*>(ms.readSequences(path, alpha ));
58  }
59  catch (Exception& e)
60  {
61  if (seqc != 0)
62  delete seqc;
63  throw e;
64  }
66  Comments maseFileHeader = seqc->getGeneralComments();
67  delete seqc;
68  map<string, size_t> groupMap = MaseTools::getAvailableSequenceSelections(maseFileHeader);
69  for (map<string, size_t>::iterator mi = groupMap.begin(); mi != groupMap.end(); mi++)
70  {
71  key = mi->first;
72  if (key.compare(0, 8, "OUTGROUP") == 0)
73  {
74  SequenceSelection ss;
75  try
76  {
77  ss = MaseTools::getSequenceSet(maseFileHeader, key);
78  }
79  catch (IOException& ioe)
80  {
81  delete psc;
82  throw ioe;
83  }
84  for (size_t i = 0; i != ss.size(); i++)
85  {
86  try
87  {
88  psc->setAsOutgroupMember(ss[i]);
89  }
90  catch (SequenceNotFoundException& snfe)
91  {
92  delete psc;
93  throw snfe;
94  }
95  }
96  }
97  }
98  return psc;
99 }
100 
101 /******************************************************************************/
102 
104 {
105  SequenceSelection ss;
106  PolymorphismSequenceContainer* psci = dynamic_cast<PolymorphismSequenceContainer*>(psc.clone());
107  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
108  {
109  if (!psc.isIngroupMember(i))
110  ss.push_back(i);
111  }
112  if (ss.size() == psc.getNumberOfSequences())
113  {
114  delete psci;
115  throw Exception("PolymorphismSequenceContainerTools::extractIngroup: no Ingroup sequences found.");
116  }
117  for (size_t i = ss.size(); i > 0; --i)
118  {
119  psci->deleteSequence(ss[i - 1]);
120  }
121  return psci;
122 }
123 
124 /******************************************************************************/
125 
127 {
128  SequenceSelection ss;
129  PolymorphismSequenceContainer* psci = dynamic_cast<PolymorphismSequenceContainer*>(psc.clone());
130  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
131  {
132  if (psc.isIngroupMember(i) )
133  ss.push_back(i);
134  }
135  if (ss.size() == psc.getNumberOfSequences())
136  {
137  delete psci;
138  throw Exception("PolymorphismSequenceContainerTools::extractOutgroup: no Outgroup sequences found.");
139  }
140  for (size_t i = ss.size(); i > 0; i--)
141  {
142  psci->deleteSequence(ss[i - 1]);
143  }
144  return psci;
145 }
146 
147 /******************************************************************************/
148 
150 {
151  SequenceSelection ss;
152  PolymorphismSequenceContainer* psci = dynamic_cast<PolymorphismSequenceContainer*>(psc.clone());
153  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
154  {
155  if (psc.getGroupId(i) != group_id)
156  ss.push_back(i);
157  }
158  if (ss.size() == psc.getNumberOfSequences())
159  {
160  delete psci;
161  throw GroupNotFoundException("PolymorphismSequenceContainerTools::extractGroup: group_id not found.", group_id);
162  }
163  for (size_t i = ss.size(); i > 0; i--)
164  {
165  psci->deleteSequence(ss[i - 1]);
166  }
167  return psci;
168 }
169 
170 /******************************************************************************/
171 
173 {
174  PolymorphismSequenceContainer* newpsc = new PolymorphismSequenceContainer(psc.getAlphabet());
175  for (size_t i = 0; i < ss.size(); i++)
176  {
177  newpsc->addSequenceWithFrequency(psc.getSequence(ss[i]), psc.getSequenceCount(i), false);
178  if (psc.isIngroupMember(i))
179  newpsc->setAsIngroupMember(i);
180  else
181  {
182  newpsc->setAsOutgroupMember(i);
183  newpsc->setGroupId(i, psc.getGroupId(i));
184  }
185  }
186  newpsc->setGeneralComments(psc.getGeneralComments());
187  return newpsc;
188 }
189 
190 /******************************************************************************/
191 
193 {
194  size_t nbSeq = psc.getNumberOfSequences();
195  vector<size_t> v;
196  for (size_t i = 0; i < nbSeq; ++i)
197  {
198  v.push_back(i);
199  }
200  vector<size_t> vv(n);
201  RandomTools::getSample(v, vv, replace);
203  return newpsc;
204 }
205 
206 /******************************************************************************/
207 
209 {
210  vector<string> seqNames = psc.getSequencesNames();
211  PolymorphismSequenceContainer* noGapCont = new PolymorphismSequenceContainer(psc.getNumberOfSequences(), psc.getAlphabet());
212  noGapCont->setSequencesNames(seqNames, false);
213  size_t nbSeq = psc.getNumberOfSequences();
214  for (size_t i = 0; i < nbSeq; i++)
215  {
216  noGapCont->setSequenceCount(i, psc.getSequenceCount(i));
217  if (psc.isIngroupMember(i))
218  noGapCont->setAsIngroupMember(i);
219  else
220  {
221  noGapCont->setAsOutgroupMember(i);
222  noGapCont->setGroupId(i, psc.getGroupId(i));
223  }
224  }
225  NoGapSiteContainerIterator ngsi(psc);
226  while (ngsi.hasMoreSites())
227  noGapCont->addSite(*ngsi.nextSite());
228  return noGapCont;
229 }
230 
231 /******************************************************************************/
232 
234 {
235  size_t count = psc.getNumberOfSites();
237  SimpleSiteContainerIterator* ssi;
238  if (ingroup)
239  {
240  try
241  {
242  npsc = extractIngroup(psc);
243  }
244  catch (Exception& e)
245  {
246  if (npsc != NULL)
247  delete npsc;
248  throw e;
249  }
250  ssi = new SimpleSiteContainerIterator(*npsc);
251  }
252  else
253  ssi = new SimpleSiteContainerIterator(psc);
254  while (ssi->hasMoreSites())
255  if (SiteTools::hasGap(*ssi->nextSite()))
256  count--;
257  delete ssi;
258  return count;
259 }
260 
261 /******************************************************************************/
262 
264 {
265  size_t count = psc.getNumberOfSites();
267  SimpleSiteContainerIterator* ssi;
268  if (ingroup)
269  {
270  try
271  {
272  npsc = extractIngroup(psc);
273  }
274  catch (Exception& e)
275  {
276  if (npsc != NULL)
277  delete npsc;
278  throw e;
279  }
280  ssi = new SimpleSiteContainerIterator(*npsc);
281  }
282  else
283  ssi = new SimpleSiteContainerIterator(psc);
284  while (ssi->hasMoreSites())
285  if (!SiteTools::isComplete(*ssi->nextSite()))
286  count--;
287  delete ssi;
288  return count;
289 }
290 
291 /******************************************************************************/
292 
294 {
295  vector<string> seqNames = psc.getSequencesNames();
296  PolymorphismSequenceContainer* complete = new PolymorphismSequenceContainer(psc.getNumberOfSequences(), psc.getAlphabet());
297  complete->setSequencesNames(seqNames, false);
298  size_t nbSeq = psc.getNumberOfSequences();
299  for (size_t i = 0; i < nbSeq; i++)
300  {
301  complete->setSequenceCount(i, psc.getSequenceCount(i));
302  if (psc.isIngroupMember(i))
303  complete->setAsIngroupMember(i);
304  else
305  {
306  complete->setAsOutgroupMember(i);
307  complete->setGroupId(i, psc.getGroupId(i));
308  }
309  }
310  CompleteSiteContainerIterator csi(psc);
311  while (csi.hasMoreSites())
312  complete->addSite(*csi.nextSite());
313  return complete;
314 }
315 
316 /******************************************************************************/
317 
319 {
321  while (SiteTools::hasGap(psci->getSite(0)))
322  psci->deleteSite(0);
323  size_t i = 0;
324  size_t n = psci->getNumberOfSites();
325  while (SiteTools::hasGap(psci->getSite(n - i - 1)))
326  {
327  psci->deleteSite(n - i - 1);
328  i++;
329  }
330  return psci;
331 }
332 
333 /******************************************************************************/
334 
336 {
337  SiteContainer* pscc = MaseTools::getSelectedSites(psc, setName);
338  Comments maseFileHeader = psc.getGeneralComments();
339  if (phase)
340  {
341  for (size_t i = 1; i < MaseTools::getPhase(maseFileHeader, setName); i++)
342  {
343  pscc->deleteSite(0);
344  }
345  }
347  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
348  {
349  if (psc.isIngroupMember(i))
350  psci->setAsIngroupMember(i);
351  else
352  {
353  psci->setAsOutgroupMember(i);
354  psci->setGroupId(i, psc.getGroupId(i));
355  }
356  }
357  psci->deleteGeneralComments();
358  delete pscc;
359  return psci;
360 }
361 
362 /******************************************************************************/
363 
365 {
366  SiteSelection ss;
367  Comments maseFileHeader = psc.getGeneralComments();
368  SiteSelection codss = MaseTools::getSiteSet(maseFileHeader, setName);
369  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
370  {
371  if (find(codss.begin(), codss.end(), i) == codss.end())
372  ss.push_back(i);
373  }
374  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
376  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
377  {
378  if (psc.isIngroupMember(i))
379  psci->setAsIngroupMember(i);
380  else
381  {
382  psci->setAsOutgroupMember(i);
383  psci->setGroupId(i, psc.getGroupId(i));
384  }
385  }
386  delete sc;
387  return psci;
388 }
389 
390 /******************************************************************************/
391 
393 {
394  Comments maseFileHeader = psc.getGeneralComments();
395  size_t start;
396  try
397  {
398  start = MaseTools::getPhase(maseFileHeader, setName);
399  }
400  catch (Exception& e)
401  {
402  start = 1;
403  }
404  SiteSelection ss;
405  size_t i;
406  if ((int)pos - (int)start >= 0)
407  i = pos - start;
408  else
409  i = pos - start + 3;
410  while (i < psc.getNumberOfSites())
411  {
412  ss.push_back(i);
413  i += 3;
414  }
415  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
417  for (size_t j = 0; j < psc.getNumberOfSequences(); j++)
418  {
419  if (psc.isIngroupMember(j))
420  newpsc->setAsIngroupMember(j);
421  else
422  {
423  newpsc->setAsOutgroupMember(j);
424  newpsc->setGroupId(i, psc.getGroupId(j));
425  }
426  }
427  delete sc;
428  return newpsc;
429 }
430 
431 /******************************************************************************/
432 
435  const std::string& setName,
436  const GeneticCode* gCode)
437 {
438  Comments maseFileHeader = psc.getGeneralComments();
439  SiteSelection ss;
440  SiteSelection codss = MaseTools::getSiteSet(maseFileHeader, setName);
441  size_t start;
442  try
443  {
444  start = MaseTools::getPhase(maseFileHeader, setName);
445  }
446  catch (Exception& e)
447  {
448  throw e;
449  }
450 
451  size_t first = 0, last = psc.getNumberOfSites();
452  // Check if the first codon is AUG
453  if (start == 1 &&
454  psc.getSite(codss[0]).getValue(0) == 0 &&
455  psc.getSite(codss[1]).getValue(0) == 3 &&
456  psc.getSite(codss[2]).getValue(0) == 2)
457  first = codss[0];
458  // Check if the last codon is a STOP one
459  int c1 = psc.getSite(codss[codss.size() - 3]).getValue(0);
460  int c2 = psc.getSite(codss[codss.size() - 2]).getValue(0);
461  int c3 = psc.getSite(codss[codss.size() - 1]).getValue(0);
462  if (gCode->isStop(gCode->getSourceAlphabet()->getCodon(c1, c2, c3)))
463  last = codss[codss.size() - 1];
464  // Keep sites between AUG and STOP
465  for (size_t i = first; i < last; i++)
466  {
467  if (find(codss.begin(), codss.end(), i) == codss.end())
468  {
469  ss.push_back(i);
470  }
471  }
472  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
474  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
475  {
476  if (psc.isIngroupMember(i))
477  psci->setAsIngroupMember(i);
478  else
479  {
480  psci->setAsOutgroupMember(i);
481  psci->setGroupId(i, psc.getGroupId(i));
482  }
483  }
484  delete sc;
485  return psci;
486 }
487 
488 /******************************************************************************/
489 
491 {
492  Comments maseFileHeader = psc.getGeneralComments();
493  SiteSelection ss;
494  SiteSelection codss = MaseTools::getSiteSet(maseFileHeader, setName);
495  size_t start = MaseTools::getPhase(maseFileHeader, setName);
496  size_t last = 0;
497  // Check if the first Codon is AUG
498  if (start == 1 &&
499  psc.getSite(codss[0]).getValue(0) == 0 &&
500  psc.getSite(codss[1]).getValue(0) == 3 &&
501  psc.getSite(codss[2]).getValue(0) == 2)
502  last = codss[0];
503  for (size_t i = 0; i < last; i++)
504  {
505  if (find(codss.begin(), codss.end(), i) == codss.end())
506  {
507  ss.push_back(i);
508  }
509  }
510  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
512  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
513  {
514  if (psc.isIngroupMember(i))
515  psci->setAsIngroupMember(i);
516  else
517  {
518  psci->setAsOutgroupMember(i);
519  psci->setGroupId(i, psc.getGroupId(i));
520  }
521  }
522  delete sc;
523  return psci;
524 }
525 
526 /******************************************************************************/
527 
530  const std::string& setName,
531  const GeneticCode* gCode)
532 {
533  Comments maseFileHeader = psc.getGeneralComments();
534  SiteSelection ss;
535  SiteSelection codss = MaseTools::getSiteSet(maseFileHeader, setName);
536  size_t first = psc.getNumberOfSites() - 1;
537  // Check if the last codon is a STOP one
538  int c1 = psc.getSite(codss[codss.size() - 3]).getValue(0);
539  int c2 = psc.getSite(codss[codss.size() - 2]).getValue(0);
540  int c3 = psc.getSite(codss[codss.size() - 1]).getValue(0);
541  if (gCode->isStop(gCode->getSourceAlphabet()->getCodon(c1, c2, c3)))
542  first = codss[codss.size() - 1];
543  for (size_t i = first; i < psc.getNumberOfSites(); i++)
544  {
545  if (find(codss.begin(), codss.end(), i) == codss.end())
546  {
547  ss.push_back(i);
548  }
549  }
550  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
552  for (size_t i = 0; i < psc.getNumberOfSequences(); i++)
553  {
554  if (psc.isIngroupMember(i))
555  psci->setAsIngroupMember(i);
556  else
557  {
558  psci->setAsOutgroupMember(i);
559  psci->setGroupId(i, psc.getGroupId(i));
560  }
561  }
562  delete sc;
563  return psci;
564 }
565 
566 /******************************************************************************/
567 
569 {
570  string key;
571  string speciesName;
572  Comments maseFileHeader = psc.getGeneralComments();
573  if (!maseFileHeader.size())
574  return speciesName;
575  map<string, size_t> groupMap = MaseTools::getAvailableSequenceSelections(maseFileHeader);
576  for (map<string, size_t>::iterator mi = groupMap.begin(); mi != groupMap.end(); mi++)
577  {
578  key = mi->first;
579  if (key.compare(0, 7, "INGROUP") == 0)
580  {
581  StringTokenizer* sptk = new StringTokenizer(key, "_");
582  speciesName = sptk->getToken(1) + " " + sptk->getToken(2);
583  }
584  }
585  return speciesName;
586 }
587 
588 /******************************************************************************/
static PolymorphismSequenceContainer * excludeFlankingGap(const PolymorphismSequenceContainer &psc)
exclude flanking sites with gap but keep gap sites within the alignment
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
static PolymorphismSequenceContainer * getSelectedSequences(const PolymorphismSequenceContainer &psc, const SequenceSelection &ss)
Extract selected sequences.
static PolymorphismSequenceContainer * sample(const PolymorphismSequenceContainer &psc, size_t n, bool replace=true)
Get a random set of sequences.
static PolymorphismSequenceContainer * extractGroup(const PolymorphismSequenceContainer &psc, size_t group_id)
Extract a special group from the PolymorphismSequenceContainer.
The GroupNotFoundException class.
static PolymorphismSequenceContainer * getSitesWithoutGaps(const PolymorphismSequenceContainer &psc)
Retrieves sites without gaps from PolymorphismSequenceContainer.
PolymorphismSequenceContainer * clone() const
Clone a PolymorphismSequenceContainer.
STL namespace.
static PolymorphismSequenceContainer * getSelectedSites(const PolymorphismSequenceContainer &psc, const std::string &setName, bool phase)
Get a PolymorphismSequenceContainer corresponding to a site selection annotated in the mase comments...
static size_t getNumberOfCompleteSites(const PolymorphismSequenceContainer &psc, bool ingroup)
Return number of completely resolved sites in a PolymorphismSequenceContainer.
static PolymorphismSequenceContainer * getCompleteSites(const PolymorphismSequenceContainer &psc)
Retrieves complete sites from a PolymorphismSequenceContainer.
size_t getGroupId(size_t index) const
Get the group identifier of the sequence.
bool isIngroupMember(size_t index) const
Tell if the sequence is ingroup by index.
static PolymorphismSequenceContainer * getNonCodingSites(const PolymorphismSequenceContainer &psc, const std::string &setName)
Retrieve non-coding sites defined in the mase file header.
static PolymorphismSequenceContainer * extractIngroup(const PolymorphismSequenceContainer &psc)
Extract ingroup sequences from a PolymorphismSequenceContainer and create a new one.
unsigned int getSequenceCount(size_t index) const
Get the count of a sequence by index.
void addSequenceWithFrequency(const Sequence &sequence, unsigned int frequency, bool checkName=true)
Add a sequence to the container.
void setGroupId(size_t index, size_t group_id)
Set the group identifier of a sequence.
static PolymorphismSequenceContainer * read(const std::string &path, const Alphabet *alpha)
Read a Mase+ file and return a PolymorphismSequenceContainer. Toggle Sequence when selection tag begi...
void deleteSequence(size_t index)
Delete a sequence by index.
static PolymorphismSequenceContainer * get5Prime(const PolymorphismSequenceContainer &psc, const std::string &setName)
Retrieve 5&#39; sites.
void setAsIngroupMember(size_t index)
Set a sequence as ingroup member by index.
static PolymorphismSequenceContainer * getIntrons(const PolymorphismSequenceContainer &psc, const std::string &setName, const GeneticCode *gCode)
Retrieve intron sites.
static size_t getNumberOfNonGapSites(const PolymorphismSequenceContainer &psc, bool ingroup)
Return number of sites without gaps in a PolymorphismSequenceContainer.
void setSequenceCount(size_t index, unsigned int count)
Set the count of a sequence by index.
The PolymorphismSequenceContainer class.
static PolymorphismSequenceContainer * getOnePosition(const PolymorphismSequenceContainer &psc, const std::string &setName, size_t pos)
Retrieve sites at one codon position (1,2,3)
static PolymorphismSequenceContainer * extractOutgroup(const PolymorphismSequenceContainer &psc)
Extract outgroup sequences from a PolymorphismSequenceContainer and create a new one.
static std::string getIngroupSpeciesName(const PolymorphismSequenceContainer &psc)
Get the species name of the ingroup.
static PolymorphismSequenceContainer * get3Prime(const PolymorphismSequenceContainer &psc, const std::string &setName, const GeneticCode *gCode)
Retrieve 3&#39; sites.