bpp-seq  2.2.0
CompressedVectorSiteContainer.cpp
Go to the documentation of this file.
1 //
2 // File: CompressedVectorSiteContainer.cpp
3 // Created by: Julien Dutheil
4 // Created on: Wed Dec 16 12:08 2009
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for sequences analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
41 #include <Bpp/Text/TextTools.h>
42 
43 #include <iostream>
44 
45 using namespace std;
46 
47 using namespace bpp;
48 
// Build a container from a list of sites. Sites are added one by one through addSite(),
// which stores a given state pattern only once and records, per position, which pattern it uses.
// Sequence names are generated as "Seq_<i>" and every sequence gets an empty Comments object.
// Throws Exception when `vs` is empty; addSite() may throw on a size or alphabet mismatch.
// NOTE(review): doxygen line 55 is absent from this listing — presumably the base-class
// initializer taking `alpha`, since `alpha` is not used anywhere else here; confirm.
51 CompressedVectorSiteContainer::CompressedVectorSiteContainer(
52  const std::vector<const Site*>& vs,
53  const Alphabet* alpha)
54 throw (Exception) :
56  sites_(0),
57  index_(0),
58  names_(0),
59  comments_(0),
60  sequences_(0)
61 {
62  if (vs.size() == 0) throw Exception("CompressedVectorSiteContainer::CompressedVectorSiteContainer. Empty site set.");
63  // The number of sequences is taken from the first site; names and comments are sized accordingly:
64  size_t nbSeq = vs[0]->size();
65  names_.resize(nbSeq);
66  comments_.resize(nbSeq);
67  for (size_t i = 0; i < nbSeq; i++)
68  {
69  names_[i] = "Seq_" + TextTools::toString(i);
70  comments_[i] = new Comments();
71  }
72  // Now add each site.
// NOTE(review): if addSite() throws here, the Comments objects allocated above are held only
// through raw pointers and do not appear to be released — possible leak on failure; confirm.
73  for (size_t i = 0; i < vs.size(); i++)
74  {
75  addSite(*vs[i]); // May throw if the site's size or alphabet does not match the container.
76  }
77 
// One (initially null) slot per sequence; getSequence() fills these on demand.
78  sequences_.resize(nbSeq);
79 }
80 
81 /******************************************************************************/
82 
// Build a container with `size` sequences and no site.
// Sequence names are generated as "Seq_<i>" and every sequence gets an empty Comments object.
// NOTE(review): doxygen line 84 is absent from this listing — presumably the base-class
// initializer taking `alpha`, since `alpha` is not used anywhere else here; confirm.
83 CompressedVectorSiteContainer::CompressedVectorSiteContainer(size_t size, const Alphabet* alpha) :
85  sites_(0),
86  index_(0),
87  names_(size),
88  comments_(size),
89  sequences_(size)
90 {
91  // Fill in the default name and an empty comment set for each sequence:
92  for (size_t i = 0; i < size; i++)
93  {
94  names_[i] = "Seq_" + TextTools::toString(i);
95  comments_[i] = new Comments();
96  }
97 }
98 
99 /******************************************************************************/
100 
// Build a container with one sequence per entry of `names` and no site.
// Names are copied as given (no uniqueness check is performed here) and every sequence
// gets an empty Comments object.
// NOTE(review): doxygen line 102 is absent from this listing — presumably the base-class
// initializer taking `alpha`, since `alpha` is not used anywhere else here; confirm.
101 CompressedVectorSiteContainer::CompressedVectorSiteContainer(const std::vector<std::string>& names, const Alphabet* alpha) :
103  sites_(0),
104  index_(0),
105  names_(names.size()),
106  comments_(names.size()),
107  sequences_(names.size())
108 {
109  // Copy the names and create an empty comment set for each sequence:
110  for (size_t i = 0; i < names.size(); i++)
111  {
112  names_[i] = names[i];
113  comments_[i] = new Comments();
114  }
115 }
116 
117 /******************************************************************************/
118 
// NOTE(review): the constructor signature (doxygen lines 119-120) is missing from this listing.
// The visible initializers leave every member container empty; the body does nothing.
121  sites_(0),
122  index_(0),
123  names_(0),
124  comments_(0),
125  sequences_(0)
126 {}
127 
128 /******************************************************************************/
129 
// NOTE(review): the signature (doxygen lines 130-131) is missing from this listing; since the
// body reads vsc.sites_ and vsc.index_ directly, this is presumably the copy constructor — confirm.
132  sites_(vsc.sites_.size()),
133  index_(vsc.index_),
134  names_(vsc.names_),
135  comments_(vsc.getNumberOfSequences()),
136  sequences_(vsc.getNumberOfSequences())
137 {
138  // Deep-copy the stored site patterns (index_ was already copied in the initializer list):
139  sites_.resize(vsc.sites_.size());
140  for (size_t i = 0; i < vsc.sites_.size(); i++)
141  {
142  sites_[i] = dynamic_cast<Site*>(vsc.sites_[i]->clone());
143  }
// NOTE(review): patterns and index_ are already complete at this point, and addSite() appends
// one entry to index_ on every call. This loop therefore looks like it duplicates every
// position of `vsc` in the copy — verify against getNumberOfSites() and consider removing it.
144  for (size_t i = 0; i < vsc.getNumberOfSites(); i++)
145  {
146  addSite(vsc.getSite(i), false); // We assume that positions are correct.
147  }
148  // Deep-copy the per-sequence comments:
149  for (size_t i = 0; i < vsc.getNumberOfSequences(); i++)
150  {
151  comments_[i] = new Comments(vsc.getComments(i));
152  }
153 }
154 
155 /******************************************************************************/
156 
// NOTE(review): the signature (doxygen line 157) is missing from this listing; the body builds
// this container from another site container `sc` using only sc's accessor methods.
158  AbstractSequenceContainer(sc.getAlphabet()),
159  sites_(0),
160  index_(0),
161  names_(sc.getSequencesNames()),
162  comments_(sc.getNumberOfSequences()),
163  sequences_(sc.getNumberOfSequences())
164 {
165  // Add each site of `sc`; addSite() stores a given pattern only once:
166  for (size_t i = 0; i < sc.getNumberOfSites(); i++)
167  {
168  addSite(sc.getSite(i), false); // We assume that positions are correct.
169  }
170  // Deep-copy the per-sequence comments:
171  for (size_t i = 0; i < sc.getNumberOfSequences(); i++)
172  {
173  comments_[i] = new Comments(sc.getComments(i));
174  }
175 }
176 
177 /******************************************************************************/
178 
180 {
// NOTE(review): the signature (doxygen line 179) and doxygen line 181 are missing from this
// listing; since the body reads vsc.sites_ and vsc.index_, this is presumably the copy-assignment
// operator — confirm.
// NOTE(review): no visible statement releases the Site and Comments objects this container
// already owns before their pointers are overwritten below, and existing sequences_ entries are
// kept — possible leak / stale cache unless the missing line handles it; verify.
182  // Seq names:
183  names_ = vsc.names_;
184  // Deep-copy the stored site patterns:
185  sites_.resize(vsc.sites_.size());
186  for (size_t i = 0; i < vsc.sites_.size(); i++)
187  {
188  sites_[i] = dynamic_cast<Site*>(vsc.sites_[i]->clone());
189  }
190  index_ = vsc.index_;
191  // Deep-copy the per-sequence comments:
192  size_t nbSeq = vsc.getNumberOfSequences();
193  comments_.resize(nbSeq);
194  for (size_t i = 0; i < nbSeq; i++)
195  {
196  comments_[i] = new Comments(vsc.getComments(i));
197  }
198  sequences_.resize(nbSeq);
199 
200  return *this;
201 }
202 
203 /******************************************************************************/
204 
206 {
// NOTE(review): the signature (doxygen line 205) and doxygen line 208 are missing from this
// listing; the body rebuilds this container from another site container `sc`.
// clear() releases everything currently owned before the new content is read from `sc`.
// NOTE(review): there is no visible self-assignment guard; if `sc` were this same object,
// clear() would empty the source before it is copied — confirm whether that can happen.
207  clear();
209  // Seq names:
210  names_ = sc.getSequencesNames();
211  // Add each site of `sc`; addSite() stores a given pattern only once:
212  for (size_t i = 0; i < sc.getNumberOfSites(); i++)
213  {
214  addSite(sc.getSite(i), false); // We assume that positions are correct.
215  }
216  // Deep-copy the per-sequence comments:
217  size_t nbSeq = sc.getNumberOfSequences();
218  comments_.resize(nbSeq);
219  for (size_t i = 0; i < nbSeq; i++)
220  {
221  comments_[i] = new Comments(sc.getComments(i));
222  }
223  sequences_.resize(nbSeq);
224 
225  return *this;
226 }
227 
228 /******************************************************************************/
229 
230 const Site& CompressedVectorSiteContainer::getSite(size_t i) const throw (IndexOutOfBoundsException)
231 {
232  if (i >= getNumberOfSites())
233  throw IndexOutOfBoundsException("CompressedVectorSiteContainer::getSite.", i, 0, getNumberOfSites() - 1);
234  return *sites_[index_[i]];
235 }
236 
237 /******************************************************************************/
238 
239 void CompressedVectorSiteContainer::setSite(size_t pos, const Site& site, bool checkPositions) throw (Exception)
240 {
241  if (pos >= getNumberOfSites()) throw IndexOutOfBoundsException("CompressedVectorSiteContainer::setSite.", pos, 0, getNumberOfSites() - 1);
242 
243  // Check size:
244  if (site.size() != getNumberOfSequences()) throw SiteException("AlignedSequenceContainer::addSite. Site does not have the appropriate length", &site);
245 
246  // New site's alphabet and site container's alphabet matching verification
247  if (site.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
248  throw AlphabetMismatchException("CompressedVectorSiteContainer::setSite", getAlphabet(), site.getAlphabet());
249 
250  size_t current = index_[pos];
251  size_t siteIndex = getSiteIndex_(site);
252  if (siteIndex == current)
253  {
254  //Nothing to do here, this is the same site.
255  }
256  else if (siteIndex < sites_.size())
257  {
258  //The new site is already in the list, si we just update the index:
259  index_[pos] = siteIndex;
260 
261  //We have to check if the previous pattern was unique, and if so, remove it and update indices:
262  bool test = true;
263  for (size_t i = 0; test && i < index_.size(); ++i)
264  {
265  if (index_[i] == current)
266  {
267  //There is another site, so nothing to do...
268  test = false;
269  }
270  }
271  if (test)
272  {
273  //There was no other site pointing toward this pattern, so we remove it.
274  delete sites_[current];
275  sites_.erase(sites_.begin() + static_cast<ptrdiff_t>(current));
276  //Now we have to correct all indices:
277  for (size_t i = 0; i < index_.size(); ++i)
278  {
279  if (index_[i] > current) index_[i]--;
280  }
281  }
282  }
283  else
284  {
285  //This is a new pattern, and we have to add it to the list...
286  Site* copy = dynamic_cast<Site*>(site.clone());
287 
288  //Now we have to check if the previous pattern was unique, and if so,
289  //replace it with the new one. Otherwise, add the new site at the end of the list.
290  bool test = true;
291  for (size_t i = 0; test && i < index_.size(); ++i)
292  {
293  if (i != pos && index_[i] == current)
294  {
295  //There is another site, so nothing to do...
296  test = false;
297  }
298  }
299  if (test)
300  {
301  //There was no other site pointing toward this pattern, so we remove it.
302  delete sites_[current];
303  sites_[current] = copy;
304  }
305  else
306  {
307  //We add the site at the end:
308  sites_.push_back(copy);
309  index_[pos] = siteIndex;
310  }
311  }
312 }
313 
314 /******************************************************************************/
315 
316 Site* CompressedVectorSiteContainer::removeSite(size_t i) throw (IndexOutOfBoundsException)
317 {
318  if (i >= getNumberOfSites()) throw IndexOutOfBoundsException("CompressedVectorSiteContainer::removeSite.", i, 0, getNumberOfSites() - 1);
319  //Here we return a copy of the site, as it will not necessarily be removed from the set, so we don't want to delete it.
320  Site* site = dynamic_cast<Site *>(sites_[index_[i]]->clone());
321  deleteSite(i);
322  return site;
323 }
324 
325 /******************************************************************************/
326 
327 void CompressedVectorSiteContainer::deleteSite(size_t siteIndex) throw (IndexOutOfBoundsException)
328 {
329  if (siteIndex >= getNumberOfSites())
330  throw IndexOutOfBoundsException("CompressedVectorSiteContainer::deleteSite.", siteIndex, 0, getNumberOfSites() - 1);
331  //Here we need to check whether the pattern corresponding to this site is unique:
332  size_t current = index_[siteIndex];
333  bool test = true;
334  for (size_t j = 0; test && j < index_.size(); ++j)
335  {
336  if (j != siteIndex && index_[j] == current)
337  {
338  //There is a nother site, so nothing to...
339  test = false;
340  }
341  }
342  if (test)
343  {
344  //There was no other site pointing toward this pattern, so we remove it.
345  delete sites_[current];
346  sites_.erase(sites_.begin() + static_cast<ptrdiff_t>(current));
347  //Now we have to correct all indices:
348  for (size_t j = 0; j < index_.size(); ++j)
349  {
350  if (index_[j] > current) index_[j]--;
351  }
352  }
353  index_.erase(index_.begin() + static_cast<ptrdiff_t>(siteIndex));
354 }
355 
356 /******************************************************************************/
357 
358 void CompressedVectorSiteContainer::deleteSites(size_t siteIndex, size_t length) throw (IndexOutOfBoundsException)
359 {
360  //This may be optimized later:
361  for (size_t i = 0; i < length; ++i) {
362  deleteSite(siteIndex + i);
363  }
364 }
365 
366 /******************************************************************************/
367 
368 void CompressedVectorSiteContainer::addSite(const Site& site, bool checkPositions) throw (Exception)
369 {
370  // Check size:
371  if (site.size() != getNumberOfSequences()) throw SiteException("CompressedVectorSiteContainer::addSite. Site does not have the appropriate length", &site);
372 
373  // New site's alphabet and site container's alphabet matching verification
374  if (site.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
375  {
376  throw AlphabetMismatchException("CompressedVectorSiteContainer::addSite", getAlphabet(), site.getAlphabet());
377  }
378 
379  size_t siteIndex = getSiteIndex_(site);
380  if (siteIndex == sites_.size())
381  {
382  //This is a new pattern:
383  Site* copy = dynamic_cast<Site*>(site.clone());
384  sites_.push_back(copy);
385  }
386  index_.push_back(siteIndex);
387 }
388 
389 /******************************************************************************/
390 
391 void CompressedVectorSiteContainer::addSite(const Site& site, size_t siteIndex, bool checkPositions) throw (Exception)
392 {
393  if (siteIndex >= getNumberOfSites()) throw IndexOutOfBoundsException("CompressedVectorSiteContainer::addSite", siteIndex, 0, getNumberOfSites() - 1);
394 
395  // Check size:
396  if (site.size() != getNumberOfSequences()) throw SiteException("CompressedVectorSiteContainer::addSite. Site does not have the appropriate length", &site);
397 
398  // New site's alphabet and site container's alphabet matching verification
399  if (site.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType())
400  {
401  throw AlphabetMismatchException("CompressedVectorSiteContainer::addSite", getAlphabet(), site.getAlphabet());
402  }
403 
404  size_t index = getSiteIndex_(site);
405  if (index == sites_.size())
406  {
407  //This is a new pattern:
408  Site* copy = dynamic_cast<Site*>(site.clone());
409  sites_.push_back(copy);
410  }
411  index_.insert(index_.begin() + static_cast<ptrdiff_t>(siteIndex), index);
412 }
413 
414 /******************************************************************************/
415 
417 {
// NOTE(review): the signature (doxygen line 416) is missing from this listing; presumably
// this is reindexSites().
// Numbers the stored patterns 1, 2, ... in storage order. The loop walks sites_ (the unique
// patterns), not index_, so alignment positions that share a pattern share one position value.
418  int pos = 1; // first position is 1.
419  for (vector<Site*>::iterator i = sites_.begin(); i < sites_.end(); i++)
420  {
421  (*i)->setPosition(pos++);
422  }
423 }
424 
425 /******************************************************************************/
426 
428 {
// NOTE(review): the signature (doxygen line 427) is missing from this listing; presumably
// this is getSitePositions().
// Returns one entry per alignment position: the position attribute of the pattern that
// position points to through index_.
429  size_t n = getNumberOfSites();
430  Vint positions(n);
431  for (size_t i = 0; i < n; i++)
432  {
433  positions[i] = sites_[index_[i]]->getPosition();
434  }
435  return positions;
436 }
437 
438 /******************************************************************************/
439 
440 const Sequence& CompressedVectorSiteContainer::getSequence(size_t i) const throw (IndexOutOfBoundsException)
441 {
442  if (i >= getNumberOfSequences()) throw IndexOutOfBoundsException("CompressedVectorSiteContainer::getSequence.", i, 0, getNumberOfSequences() - 1);
443 
444  // Main loop : for all sites
445  size_t n = getNumberOfSites();
446  vector<int> sequence(n);
447  for (size_t j = 0; j < n; j++)
448  {
449  sequence[j] = sites_[index_[j]]->getContent()[i];
450  }
451  if (sequences_[i]) delete sequences_[i];
452  sequences_[i] = new BasicSequence(names_[i], sequence, *comments_[i], getAlphabet());
453  return *sequences_[i];
454 }
455 
456 /******************************************************************************/
457 
459 {
// NOTE(review): the signature (doxygen line 458) is missing from this listing; presumably
// this is the getSequence overload taking a sequence name.
460  // Resolve the name to an index, then delegate to the index-based getSequence():
461  size_t pos = getSequencePosition(name);
462  return getSequence(pos);
463 }
464 
465 /******************************************************************************/
466 
467 bool CompressedVectorSiteContainer::hasSequence(const string& name) const
468 {
469  //Look for sequence name:
470  for (size_t pos = 0; pos < names_.size(); pos++) {
471  if (names_[pos] == name) return true;
472  }
473  return false;
474 }
475 
476 /******************************************************************************/
477 
479 {
// NOTE(review): the signature (doxygen line 478) is missing from this listing; presumably
// this is getSequencePosition(name).
480  // Return the index of the first stored name equal to `name`:
481  for (size_t pos = 0; pos < names_.size(); pos++)
482  {
483  if (names_[pos] == name) return pos;
484  }
// No match: report the missing name to the caller.
485  throw SequenceNotFoundException("CompressedVectorSiteContainer::getSequencePosition().", name);
486 }
487 
488 /******************************************************************************/
489 
491 {
// NOTE(review): the signature (doxygen line 490) is missing from this listing; presumably
// this is clear().
// Releases every object owned through raw pointers, then empties all member vectors.
492  // Delete all stored site patterns:
493  for (size_t i = 0; i < sites_.size(); i++)
494  {
495  delete sites_[i];
496  }
497 
498  // Delete all per-sequence comments:
499  for (size_t i = 0; i < comments_.size(); i++)
500  {
501  if (comments_[i]) delete comments_[i];
502  }
503 
504  // Delete the sequences built by getSequence(), if any:
505  for (size_t i = 0; i < sequences_.size(); i++)
506  {
507  if (sequences_[i]) delete (sequences_[i]);
508  }
509 
510  // Drop the (now dangling) pointers, the position index and the names:
511  sites_.clear();
512  index_.clear();
513  names_.clear();
514  comments_.clear();
515  sequences_.clear();
516 }
517 
518 /******************************************************************************/
519 
521 {
// NOTE(review): the signature (doxygen line 520) is missing from this listing; presumably
// this is getSequencesNames().
// Returns an element-by-element copy of names_.
522  vector<string> seqnames(names_.size());
523  for (size_t i = 0; i < names_.size(); i++)
524  {
525  seqnames[i] = names_[i];
526  }
527  return seqnames;
528 }
529 
530 /******************************************************************************/
531 
533  const vector<string>& names,
534  bool checkNames)
535 throw (Exception)
536 {
// NOTE(review): the first signature line (doxygen line 532) is missing from this listing;
// presumably this is setSequencesNames().
// Replaces all sequence names. `names` must contain exactly one name per sequence; when
// `checkNames` is true, duplicated names inside `names` are rejected before anything is changed.
537  if (names.size() != getNumberOfSequences())
538  throw IndexOutOfBoundsException("CompressedVectorSiteContainer::setSequenceNames: bad number of names.", names.size(), getNumberOfSequences(), getNumberOfSequences());
539  if (checkNames)
540  {
541  for (size_t i = 0; i < names.size(); i++)
542  {
543  // Compare names[i] with every earlier name; throw on the first duplicate (quadratic scan):
544  for (size_t j = 0; j < i; j++)
545  {
546  if (names[j] == names[i])
547  throw Exception("CompressedVectorSiteContainer::setSequencesNames : Sequence's name already exists in container");
548  }
549  }
550  }
// All checks passed: overwrite the stored names.
551  for (size_t i = 0; i < names.size(); i++)
552  {
553  names_[i] = names[i];
554  }
555 }
556 
557 /******************************************************************************/
558 
559 void CompressedVectorSiteContainer::setComments(size_t sequenceIndex, const Comments& comments) throw (IndexOutOfBoundsException)
560 {
561  comments_[sequenceIndex] = new Comments(comments);
562 }
563 
564 /******************************************************************************/
565 
567 {
// NOTE(review): the signature (doxygen line 566) and the statements that create `vsc`
// (doxygen lines 568-569) are missing from this listing; presumably this is
// createEmptyContainer(). Only the final return is visible here.
570  return vsc;
571 }
572 
573 /******************************************************************************/
574 
576 {
// NOTE(review): the signature (doxygen line 575) is missing from this listing; callers in
// this file invoke it as getSiteIndex_(site).
// Linear search for a stored pattern whose states equal those of `site`.
// Returns the index of the first match in sites_, or sites_.size() when there is none.
// Only the states are compared (element by element); site positions are not.
577  size_t pos = sites_.size();
578  bool test;
579  for (size_t i = 0; i < sites_.size(); ++i)
580  {
581  test = true;
582  for (size_t j = 0; test && j < site.size(); ++j) //site is supposed to have the correct size, that is the same as all the ones in the container.
583  {
584  if (site[j] != (*sites_[i])[j])
585  test = false;
586  }
587  if (test)
588  {
// First identical pattern found: remember it and stop searching.
589  pos = i;
590  break;
591  }
592  }
593  return pos;
594 }
595 
596 /******************************************************************************/
597 
Exception thrown when a sequence is not found. The sequence not found exception base class...
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Sequence.h:60
const Site & getSite(size_t siteIndex) const
Get a site from the container.
The SiteContainer interface.
Definition: SiteContainer.h:63
This alphabet is used to deal NumericAlphabet.
The site exception base class.
const Alphabet * getAlphabet() const
Get sequence container's alphabet.
The Alphabet interface.
Definition: Alphabet.h:130
const Sequence & getSequence(size_t sequenceIndex) const
Retrieve a sequence object from the container.
void setSite(size_t siteIndex, const Site &site, bool checkPosition=false)
Set a site in the container.
STL namespace.
void reindexSites()
Set all positions attributes.
CompressedVectorSiteContainer(const std::vector< const Site *> &vs, const Alphabet *alpha)
Build a new container from a set of sites.
AbstractSequenceContainer & operator=(const AbstractSequenceContainer &sc)
Partial implementation of the OrderedSequenceContainer interface.
size_t getNumberOfSites() const
Get the number of sites in the container.
CompressedVectorSiteContainer * createEmptyContainer() const
Return a copy of this container, but with no sequence inside.
const Comments & getGeneralComments() const
Get the comments of this container.
virtual const Comments & getComments(size_t sequenceIndex) const =0
Get comments of a particular sequence.
Vint getSitePositions() const
Get all position attributes of sites.
A low memory, yet restricted, version of the VectorSiteContainer class.
size_t getNumberOfSequences() const
Get the number of sequences in the container.
bool hasSequence(const std::string &name) const
Check if a sequence with a given name is present in the container.
void addSite(const Site &site, bool checkPosition=false)
Add a site in the container.
void setSequencesNames(const std::vector< std::string > &names, bool checkNames=true)
Set all sequence names.
std::vector< std::string > getSequencesNames() const
Get all the names of the sequences in the container.
virtual const Site & getSite(size_t siteIndex) const =0
Get a site from the container.
void clear()
Delete all sequences in the container.
CompressedVectorSiteContainer & operator=(const CompressedVectorSiteContainer &vsc)
void deleteSite(size_t siteIndex)
Delete a site in the container.
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
virtual size_t size() const
Get the number of elements in the list.
Definition: SymbolList.h:350
void setComments(size_t sequenceIndex, const Comments &comments)
Set the comments of a particular sequence.
size_t getSequencePosition(const std::string &name) const
Get the position of a sequence in sequence container from its name.
virtual size_t getNumberOfSites() const =0
Get the number of sites in the container.
The sequence interface.
Definition: Sequence.h:74
The Site class.
Definition: Site.h:61
Exception thrown when two alphabets do not match.
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
Site * removeSite(size_t siteIndex)
Remove a site from the container.
void deleteSites(size_t siteIndex, size_t length)
Delete a continuous range of sites in the container.
void setGeneralComments(const Comments &comments)
Set the comments of this container.
const Comments & getComments(const std::string &name) const
Get comments of a particular sequence.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.