bpp-seq  2.2.0
SequenceContainerTools.cpp
Go to the documentation of this file.
1 //
2 // File: SequenceContainerTools.cpp
3 // Created by: Julien Dutheil
4 // Created on: Sat Oct 4 09:18:34 2003
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for sequences analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "SequenceContainerTools.h"
42 #include "../Alphabet/CodonAlphabet.h"
43 
44 // From bpp-core:
45 #include <Bpp/Text/TextTools.h>
46 
47 using namespace bpp;
48 
49 // From the STL:
50 #include <iostream>
51 
52 using namespace std;
53 
54 /******************************************************************************/
55 
57 {
59  for (size_t i = 0; i < size; ++i)
60  {
61  vsc->addSequence(BasicSequence(TextTools::toString(i), "", alphabet), false);
62  }
63  return vsc;
64 }
65 
66 /******************************************************************************/
67 
69  const Alphabet* alphabet,
70  const vector<string>& seqNames)
71 throw (Exception)
72 {
73  SequenceContainer* sc = createContainerOfSpecifiedSize(alphabet, seqNames.size());
74  sc->setSequencesNames(seqNames, true);
75  return sc;
76 }
77 
78 /******************************************************************************/
79 
81  const OrderedSequenceContainer& sequences,
82  const SequenceSelection& selection,
83  SequenceContainer& outputCont) throw (Exception)
84 {
85  bool checkNames = outputCont.getNumberOfSequences() > 0;
86  for (size_t i = 0; i < selection.size(); i++)
87  {
88  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
89  }
90 }
91 
92 /******************************************************************************/
93 
95  const SequenceContainer& sequences,
96  const std::vector<std::string>& selection,
97  SequenceContainer& outputCont, bool strict) throw (Exception)
98 {
99  bool checkNames = outputCont.getNumberOfSequences() > 0;
100  for (size_t i = 0; i < selection.size(); i++)
101  {
102  if (strict)
103  {
104  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
105  }
106  else
107  {
108  if (sequences.hasSequence(selection[i]))
109  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
110  }
111  }
112 }
113 
114 /******************************************************************************/
115 
117  OrderedSequenceContainer& sequences,
118  const SequenceSelection& selection)
119 {
120  vector<string> names = sequences.getSequencesNames();
121  for (size_t i = 0; i < names.size(); i++)
122  {
123  // We need to do this because after removal the indices will not be the same!
124  // another solution would be to sort decreasingly the indices...
125  bool test = false;
126  for (size_t j = 0; j < selection.size() && !test; j++)
127  {
128  test = (selection[j] == i);
129  }
130  if (!test)
131  sequences.deleteSequence(names[i]);
132  // WARNING: what if selection contains several times the same indice? ...
133  }
134 }
135 
136 /******************************************************************************/
137 
139 {
140  vector<string> seqNames = sequences.getSequencesNames();
141  if (seqNames.size() <= 1)
142  return true;
143  size_t length = sequences.getSequence(seqNames[0]).size();
144  for (size_t i = 1; i < seqNames.size(); i++)
145  {
146  if (sequences.getSequence(seqNames[i]).size() != length)
147  return false;
148  }
149  return true;
150 }
151 
152 /******************************************************************************/
153 
154 void SequenceContainerTools::getFrequencies(const SequenceContainer& sequences, std::map<int, double>& f, double pseudoCount)
155 {
156  double n = 0;
157  vector<string> names = sequences.getSequencesNames();
158  for (size_t j = 0; j < names.size(); j++)
159  {
160  vector<int> seq = sequences.getContent(names[j]);
161  for (size_t i = 0; i < seq.size(); i++)
162  {
163  f[seq[i]]++;
164  }
165  n += static_cast<double>(seq.size());
166  }
167 
168  if (pseudoCount != 0)
169  {
170  const Alphabet* pA = sequences.getAlphabet();
171  for (int i = 0; i < static_cast<int>(pA->getSize()); i++)
172  {
173  f[i] += pseudoCount;
174  }
175 
176  n += pseudoCount * static_cast<double>(pA->getSize());
177  }
178 
179  for (map<int, double>::iterator i = f.begin(); i != f.end(); i++)
180  {
181  i->second = i->second / n;
182  }
183 }
184 
185 /******************************************************************************/
186 
187 void SequenceContainerTools::getCounts(const SequenceContainer& sequences, std::map<int, int>& f)
188 {
189  size_t n = 0;
190  vector<string> names = sequences.getSequencesNames();
191  for (size_t j = 0; j < names.size(); j++)
192  {
193  vector<int> seq = sequences.getContent(names[j]);
194  for (size_t i = 0; i < seq.size(); i++)
195  {
196  f[seq[i]]++;
197  }
198  n += seq.size();
199  }
200 }
201 
202 /******************************************************************************/
203 
205 {
206  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(sequences.getAlphabet());
207  if (!calpha)
208  throw AlphabetException("SequenceContainerTools::getCodonPosition. Input sequences should be of type codon.");
209  vector<string> names = sequences.getSequencesNames();
211  for (size_t j = 0; j < names.size(); j++)
212  {
213  vector<int> seq = sequences.getContent(names[j]);
214  vector<int> newseq(seq.size());
215  for (size_t i = 0; i < seq.size(); i++)
216  {
217  newseq[i] = calpha->getNPosition(seq[i], pos);
218  }
219  BasicSequence s(names[j], newseq, sequences.getComments(names[j]), calpha->getNucleicAlphabet());
220  newcont->addSequence(s);
221  }
222  return newcont;
223 }
224 
225 /******************************************************************************/
226 
virtual void setSequencesNames(const std::vector< std::string > &names, bool checkNames)=0
Set all sequence names.
static void getCounts(const SequenceContainer &sequences, std::map< int, int > &)
Compute base counts.
virtual void deleteSequence(size_t sequenceIndex)=0
Delete a sequence of the container.
const std::vector< int > & getContent(const std::string &name) const
Get the content of a sequence.
The OrderedSequenceContainer interface.
This alphabet is used to deal with NumericAlphabet.
The VectorSequenceContainer class.
virtual unsigned int getSize() const =0
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet). This is the method you'll need in most cases.
The Alphabet interface.
Definition: Alphabet.h:130
static bool sequencesHaveTheSameLength(const SequenceContainer &sequences)
Check if all sequences in a SequenceContainer have the same length.
STL namespace.
int getNPosition(int word, size_t n) const
Get the int code of the n-position of a word given its int description.
Definition: WordAlphabet.h:248
std::vector< size_t > SequenceSelection
static void keepOnlySelectedSequences(OrderedSequenceContainer &sequences, const SequenceSelection &selection)
Remove all sequences that are not in a given selection from a given container.
virtual void addSequence(const Sequence &sequence, bool checkName=true)
Add a sequence at the end of the container.
Codon alphabet class.
Definition: CodonAlphabet.h:63
static SequenceContainer * getCodonPosition(const SequenceContainer &sequences, size_t pos)
Extract a certain position (1, 2 or 3) from a container of codon sequences and returns the resulting ...
static SequenceContainer * createContainerWithSequenceNames(const Alphabet *alphabet, const std::vector< std::string > &seqNames)
Create a container with specified names.
static void getSelectedSequences(const OrderedSequenceContainer &sequences, const SequenceSelection &selection, SequenceContainer &outputCont)
Add a specified set of sequences from a container to another.
The alphabet exception base class.
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
virtual const Sequence & getSequence(const std::string &name) const =0
Retrieve a sequence object from the container.
virtual size_t size() const =0
Get the number of elements in the list.
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
virtual const NucleicAlphabet *const getNucleicAlphabet() const
virtual const std::vector< int > & getContent(const std::string &name) const =0
Get the content of a sequence.
virtual const Alphabet * getAlphabet() const =0
Get sequence container's alphabet.
static void getFrequencies(const SequenceContainer &sequences, std::map< int, double > &f, double pseudoCount=0)
Compute base frequencies.
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
The SequenceContainer interface.
static SequenceContainer * createContainerOfSpecifiedSize(const Alphabet *alphabet, size_t size)
Create a container with void sequences.