bpp-seq  2.2.0
MaseTools.cpp
Go to the documentation of this file.
1 //
2 // File: MaseTools.cpp
3 // Created by: Julien Dutheil
4 // Created on: Tue Apr 1 09:16:59 2003
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for sequences analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "MaseTools.h"
41 #include "../Container/VectorSequenceContainer.h"
42 #include "../Container/AlignedSequenceContainer.h"
43 #include "../Container/SequenceContainerTools.h"
44 #include <Bpp/Text/StringTokenizer.h>
45 #include <Bpp/Text/TextTools.h>
46 
47 #include <iostream>
48 
49 using namespace std;
50 using namespace bpp;
51 
52 SiteSelection MaseTools::getSiteSet(const Comments& maseFileHeader, const string& setName) throw (IOException)
53 {
54  SiteSelection selection;
55  for (size_t i = 0; i < maseFileHeader.size(); i++)
56  {
57  string current = maseFileHeader[i];
58  string::size_type index = current.find("# of");
59  if (index < current.npos)
60  {
61  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t=;");
62  st.nextToken(); // skip next word: may be 'regions' or 'segments' or else ;-)
63  size_t numberOfSegments = TextTools::to<size_t>(st.nextToken());
64  string name = st.unparseRemainingTokens();
65  if (name == setName)
66  {
67  // cout << numberOfSegments << " segments found." << endl;
68  // Then look for the set definition:
69  i++; // next line.
70  size_t counter = 0;
71  while (i < maseFileHeader.size())
72  {
73  current = maseFileHeader[i++];
74  StringTokenizer st2(current);
75  // st.nextToken(); //Skip ';;'
76  while (st2.hasMoreToken())
77  {
78  StringTokenizer st3(st2.nextToken(), ",");
79  size_t begin = TextTools::to<size_t>(st3.nextToken());
80  size_t end = TextTools::to<size_t>(st3.nextToken());
81  // WARNING!!! In the mase+ format, sites are numbered from 1 to nbSites,
82  // Whereas in SiteContainer the index begins at 0.
83  for (size_t j = begin; j <= end; j++)
84  {
85  selection.push_back(j - 1); // bounds included.
86  }
87  counter++;
88  if (counter == numberOfSegments)
89  return selection;
90  }
91  }
92  }
93  }
94  }
95  if (selection.size() == 0)
96  {
97  throw IOException("Site set " + setName + " has not been found in the sequence file.");
98  }
99  return selection;
100 }
101 
102 /******************************************************************************/
103 
104 SequenceSelection MaseTools::getSequenceSet(const Comments& maseFileHeader, const string& setName) throw (IOException)
105 {
106  SequenceSelection selection;
107  for (size_t i = 0; i < maseFileHeader.size(); i++)
108  {
109  string current = maseFileHeader[i];
110 
111  string::size_type index = current.find("@ of");
112  if (index < current.npos)
113  {
114  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t=;");
115  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
116  size_t numberOfSequences = TextTools::to<size_t>(st.nextToken());
117  string name = st.unparseRemainingTokens();
118  size_t counter = 0;
119  if (name == setName)
120  {
121  // cout << numberOfSequences << " segments found." << endl;
122  // Then look for the set definition:
123  i++; // next line.
124  while (i < maseFileHeader.size())
125  {
126  current = maseFileHeader[i++];
127  StringTokenizer st2(current, ",");
128  while (st2.hasMoreToken())
129  {
130  int seqIndex = TextTools::toInt(st2.nextToken());
131  // WARNING!!! In the mase+ format, sequences are numbered from 1 to nbSequences,
132  // Whereas in SequenceContainer the index begins at 0.
133  selection.push_back(static_cast<size_t>(seqIndex - 1)); // bounds included.
134  counter++;
135  if (counter == numberOfSequences)
136  return selection;
137  }
138  }
139  }
140  }
141  }
142  if (selection.size() == 0)
143  {
144  throw IOException("Sequence set " + setName + " has not been found in the sequence file.");
145  }
146  return selection;
147 }
148 
149 /******************************************************************************/
150 
151 SiteContainer* MaseTools::getSelectedSites(
152  const SiteContainer& sequences,
153  const string& setName) throw (IOException)
154 {
155  SiteSelection ss = getSiteSet(sequences.getGeneralComments(), setName);
156  // We need to convert positions in case of word alphabet:
157  // size_t wsize = sequences.getAlphabet()->getStateCodingSize();
158  // if (wsize > 1)
159  // {
160  // if (ss.size() % wsize != 0)
161  // throw IOException("MaseTools::getSelectedSites: Site selection is not compatible with the alphabet in use in the container.");
162  // SiteSelection ss2;
163  // for (size_t i = 0; i < ss.size(); i += wsize)
164  // {
165  // if (ss[i] % wsize != 0)
166  // throw IOException("MaseTools::getSelectedSites: Site selection is not compatible with the alphabet in use in the container.");
167  // for (size_t j = 1; j < wsize; ++j)
168  // {
169  // if (ss[i + j] != (ss[i + j - 1] + 1))
170  // throw IOException("MaseTools::getSelectedSites: Site selection is not compatible with the alphabet in use in the container.");
171  // }
172  // ss2.push_back(ss[i] / wsize);
173  // }
174  return SiteContainerTools::getSelectedPositions(sequences, ss);
175  // }
176  // else
177  // {
178  // return SiteContainerTools::getSelectedSites(sequences, ss);
179  // }
180 }
181 
182 /******************************************************************************/
183 
184 SequenceContainer* MaseTools::getSelectedSequences(
185  const OrderedSequenceContainer& sequences,
186  const std::string& setName) throw (IOException)
187 {
188  SequenceSelection ss = getSequenceSet(sequences.getGeneralComments(), setName);
189  VectorSequenceContainer* cont = new VectorSequenceContainer(sequences.getAlphabet());
190  SequenceContainerTools::getSelectedSequences(sequences, ss, *cont);
191  return cont;
192 }
193 
194 /******************************************************************************/
195 
196 map<string, size_t> MaseTools::getAvailableSiteSelections(const Comments& maseHeader)
197 {
198  map<string, size_t> selections;
199  for (size_t i = 0; i < maseHeader.size(); i++)
200  {
201  string current = maseHeader[i];
202 
203  string::size_type index = current.find("# of");
204  if (index < current.npos)
205  {
206  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t\n\f\r=;");
207  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
208  size_t numberOfSegments = TextTools::to<size_t>(st.nextToken());
209  string name = st.nextToken();
210  while (st.hasMoreToken())
211  {
212  name += " " + st.nextToken();
213  }
214  size_t counter = 0;
215  size_t nbSites = 0;
216  while (i < maseHeader.size())
217  {
218  i++;
219  current = maseHeader[i];
220  StringTokenizer st2(current);
221  // st.nextToken(); //Skip ';;'
222  while (st2.hasMoreToken())
223  {
224  StringTokenizer st3(st2.nextToken(), ",");
225  size_t begin = TextTools::to<size_t>(st3.nextToken());
226  size_t end = TextTools::to<size_t>(st3.nextToken());
227  counter++;
228  nbSites += end - begin + 1;
229  }
230  if (counter == numberOfSegments)
231  {
232  selections[name] = nbSites;
233  break;
234  }
235  }
236  }
237  }
238  return selections;
239 }
240 
241 /******************************************************************************/
242 
243 map<string, size_t> MaseTools::getAvailableSequenceSelections(const Comments& maseHeader)
244 {
245  map<string, size_t> selections;
246  for (size_t i = 0; i < maseHeader.size(); i++)
247  {
248  string current = maseHeader[i];
249 
250  string::size_type index = current.find("@ of");
251  if (index < current.npos)
252  {
253  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t\n\f\r=;");
254  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
255  size_t numberOfSequences = TextTools::fromString<size_t>(st.nextToken());
256  string name = st.nextToken();
257  while (st.hasMoreToken())
258  {
259  name += st.nextToken();
260  }
261  selections[name] = numberOfSequences;
262  }
263  }
264  return selections;
265 }
266 
267 /******************************************************************************/
268 
269 size_t MaseTools::getPhase(const Comments& maseFileHeader, const string& setName) throw (Exception)
270 {
271  size_t phase = 0;
272  string::size_type index = 0;
273  for (size_t i = 0; i < maseFileHeader.size(); i++)
274  {
275  string current = maseFileHeader[i];
276 
277  index = current.find("# of");
278  if (index < current.npos)
279  {
280  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 12), current.end()), " \t\n\f\r=;");
281  // size_t numberOfSegments = TextTools::toInt(st.nextToken());
282  // cout << "Number of regions: " << st.nextToken() << endl;
283  string name;
284  while (st.hasMoreToken())
285  {
286  name = st.nextToken();
287  // cout << "Name of regions: " << name << endl;
288  }
289  if (name == setName)
290  {
291  return phase;
292  }
293  }
294 
295  index = current.find("/codon_start");
296  if (index < current.npos)
297  {
298  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 12), current.end()), " \t\n\f\r=;");
299  phase = TextTools::to<size_t>(st.nextToken());
300  }
301  }
302  throw Exception("PolymorphismSequenceContainer::getPhase: no /codon_start found, or site selection missing.");
303 }
304 
305 /******************************************************************************/
306 
std::vector< size_t > SiteSelection
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Sequence.h:60
The SiteContainer interface.
Definition: SiteContainer.h:63
The OrderedSequenceContainer interface.
This alphabet is used to deal NumericAlphabet.
The VectorSequenceContainer class.
STL namespace.
std::vector< size_t > SequenceSelection
The SequenceContainer interface.