bpp-seq  2.2.0
Mase.cpp
Go to the documentation of this file.
1 //
2 // File Mase.cpp
3 // Author : Guillaume Deuchst
4 // Julien Dutheil
5 // Last modification: Tuesday August 21 2003
6 //
7 
8 /*
9 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11 This software is a computer program whose purpose is to provide classes
12 for sequences analysis.
13 
14 This software is governed by the CeCILL license under French law and
15 abiding by the rules of distribution of free software. You can use,
16 modify and/ or redistribute the software under the terms of the CeCILL
17 license as circulated by CEA, CNRS and INRIA at the following URL
18 "http://www.cecill.info".
19 
20 As a counterpart to the access to the source code and rights to copy,
21 modify and redistribute granted by the license, users are provided only
22 with a limited warranty and the software's author, the holder of the
23 economic rights, and the successive licensors have only limited
24 liability.
25 
26 In this respect, the user's attention is drawn to the risks associated
27 with loading, using, modifying and/or developing or reproducing the
28 software by the user in light of its specific status of free software,
29 that may mean that it is complicated to manipulate, and that also
30 therefore means that it is reserved for developers and experienced
31 professionals having in-depth computer knowledge. Users are therefore
32 encouraged to load and test the software's suitability as regards their
33 requirements in conditions enabling the security of their systems and/or
34 data to be ensured and, more generally, to use and operate it in the
35 same conditions as regards security.
36 
37 The fact that you are presently reading this means that you have had
38 knowledge of the CeCILL license and that you accept its terms.
39 */
40 
41 #include "Mase.h"
42 #include "../StringSequenceTools.h"
43 
44 using namespace bpp;
45 using namespace std;
46 
47 /****************************************************************************************/
48 
49 void Mase::appendSequencesFromStream(std::istream& input, SequenceContainer& vsc) const throw (Exception)
50 {
51  if (!input) { throw IOException ("Mase::read : fail to open file"); }
52 
53  // Initialization
54  Comments seqComments, fileComments;
55  string temp, name, sequence = "";
56  bool comments = false;
57 
58  // Get current general comments is VectorSequenceContainer
59  fileComments = vsc.getGeneralComments();
60 
61  // Main loop : for all file lines
62  while (!input.eof())
63  {
64  getline(input, temp, '\n'); // Copy current line in temporary string
65 
66  // If first character is ;
67  if (temp[0] == ';')
68  {
69  // If second character is also ;
70  if (temp[1] == ';')
71  {
72  // File comments isolation
73  temp.erase(0,2); // Characters ;; deletion
74  if(temp != "") fileComments.push_back(temp);
75  }
76  else
77  {
78  // If a name and a sequence were founded
79  if ((name != "") && (sequence != ""))
80  {
81  // New sequence creation, and addition in existing VectorSequenceContainer
82  vsc.addSequence(BasicSequence(name, sequence, seqComments, vsc.getAlphabet()), checkNames_);
83  name = "";
84  sequence = "";
85  seqComments.clear();
86  }
87 
88  // Sequence commentaries isolation
89  temp.erase(temp.begin()); // Character ; deletion
90  if (temp != "") seqComments.push_back(temp);
91  comments = true;
92  }
93  }
94  else
95  {
96  // If sequence commentaries were just isolated
97  if (comments)
98  {
99  // Sequence name isolation
100  name = temp;
101  comments = false;
102  }
103  else sequence += temp; // Sequence isolation
104  }
105  }
106 
107  // Addition of the last sequence in file
108  if ((name != "") && (sequence != ""))
109  {
110  vsc.addSequence(BasicSequence(name, sequence, seqComments, vsc.getAlphabet()), checkNames_);
111  }
112 
113  // Set new general comments in VectorSequenceContainer (old + new comments)
114  vsc.setGeneralComments(fileComments);
115 }
116 
117 /****************************************************************************************/
118 
119 void Mase::writeSequences(ostream& output, const SequenceContainer& sc) const throw (Exception)
120 {
121  // Checking the existence of specified file, and possibility to open it in write mode
122  if (!output) { throw IOException ("Mase::write : failed to open file"); }
123 
124  Comments comments = sc.getGeneralComments();
125 
126  // Writing all general comments in file
127  if (comments.size() == 0) {
128  output << ";;" << endl;
129  }
130  for (unsigned int i = 0 ; i < comments.size() ; i++) {
131  output << ";;" << comments[i] << endl;
132  }
133 
134  string seq, temp = ""; // Initialization
135 
136  // Main loop : for all sequences
137  vector<string> names = sc.getSequencesNames();
138  for (unsigned int i = 0 ; i < names.size() ; i ++)
139  {
140  comments = sc.getComments(names[i]);
141 
142  // Writing all sequence comments in file
143  // If no comments are associated with current sequence, an empy commentary line will be writed
144  if (comments.size() == 0)
145  {
146  output << ";" << endl;
147  }
148  else
149  {
150  for (unsigned int j = 0 ; j < comments.size() ; j++)
151  {
152  output << ";" << comments[j] << endl;
153  }
154  }
155 
156  // Sequence name writing
157  output << names[i] << endl;
158 
159  // Sequence cutting to specified characters number per line
160  seq = sc.toString(names[i]);
161  while (seq != "")
162  {
163  if (seq.size() > charsByLine_)
164  {
165  temp = seq;
166  temp.erase(temp.begin() + charsByLine_ , temp.end());
167  output << temp << endl;
168  seq.erase(seq.begin(), seq.begin() + charsByLine_);
169  }
170  else
171  {
172  output << seq << endl;
173  seq = "";
174  }
175  }
176  }
177 }
178 
179 /****************************************************************************************/
180 
181 void Mase::readHeader_(std::istream& input, MaseHeader& header) const throw (Exception)
182 {
183  do {
184  //Check if the line is a header line:
185  if (input.peek() == ';') {
186  char c;
187  input.get(c);
188  if (input.peek() == ';') {
189  input.get(c);
190  string line = FileTools::getNextLine(input);
191 
192  //Check the type of line...
193 
194  //Site selection:
195  string::size_type index = line.find("# of");
196  if (index < line.npos) {
197  StringTokenizer st(string(line.begin() + static_cast<ptrdiff_t>(index + 4), line.end()), " \t=;");
198  st.nextToken(); //skip next word: may be 'regions' or 'segments' or else ;-)
199  unsigned int numberOfSegments = TextTools::to<unsigned int>(st.nextToken());
200  string name = st.unparseRemainingTokens();
201  //Then look for the set definition:
202  MultiRange<size_t> siteSelection;
203  while (siteSelection.size() < numberOfSegments) {
204  line = FileTools::getNextLine(input);
205  if (line[0] != ';' || line[1] != ';')
206  throw Exception("Mase::readHeader_(): corrupted file, site selection " + name + " is incomplete. Aborting.");
207  line = line.substr(2);
208  StringTokenizer st2(line);
209  while (st2.hasMoreToken()) {
210  StringTokenizer st3(st2.nextToken(), ",");
211  unsigned int begin = TextTools::to<unsigned int>(st3.nextToken());
212  unsigned int end = TextTools::to<unsigned int>(st3.nextToken());
213  //WARNING!!! In the mase+ format, sites numerotation is 1-based, including, while ranges are 0-based, [a, b[:
214  siteSelection.addRange(Range<size_t>(begin - 1, end));
215  }
216  if (siteSelection.size() > numberOfSegments)
217  throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(siteSelection.size()) + "segments while expected " + TextTools::toString(numberOfSegments));
218  }
219  header.setSiteSelection(name, siteSelection);
220  } else {
221  //Sequence selection:
222  index = line.find("@ of");
223  if (index < line.npos) {
224  StringTokenizer st(line.substr(index + 4), " \t=;");
225  st.nextToken(); //skip next word: may be 'sequences' or else ;-)
226  unsigned int numberOfSequences = TextTools::to<unsigned int>(st.nextToken());
227  string name = st.unparseRemainingTokens();
228  //The look for the set definition:
229  vector<size_t> sequenceSelection;
230  while (sequenceSelection.size() < numberOfSequences) {
231  line = FileTools::getNextLine(input);
232  if (line[0] != ';' || line[1] != ';')
233  throw Exception("Mase::readHeader_(): corrupted file, sequence selection " + name + " is incomplete. Aborting.");
234  line = line.substr(2);
235  StringTokenizer st2(line, ", ");
236  while (st2.hasMoreToken()) {
237  unsigned int pos = TextTools::to<unsigned int>(st2.nextToken());
238  //WARNING!!! In the mase+ format, sequence numerotation is 1-based
239  sequenceSelection.push_back(pos);
240  }
241  if (sequenceSelection.size() > numberOfSequences)
242  throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(sequenceSelection.size()) + "sequences while expected " + TextTools::toString(numberOfSequences));
243  }
244  header.setSequenceSelection(name, sequenceSelection);
245  } else {
246  //Tree:
247  index = line.find("$");
248  if (index < line.npos) {
249  string name = TextTools::removeSurroundingWhiteSpaces(line.substr(index + 1));
250  //Here we stop if the line ends with a ";"
251  string tree = "";
252  do {
253  line = FileTools::getNextLine(input);
254  if (line[0] != ';' || line[1] != ';')
255  throw Exception("Mase::readHeader_(): corrupted file, tree " + name + " is incomplete. Aborting.");
256  line = TextTools::removeSurroundingWhiteSpaces(line.substr(2));
257  tree += line;
258  } while (! TextTools::endsWith(line, ";"));
259  header.setTree(name, tree);
260  }
261  }
262  }
263  } else {
264  input.putback(c);
265  break;
266  }
267  }
268  } while (true);
269 }
270 
271 /****************************************************************************************/
272 
273 void Mase::writeHeader_(std::ostream& output, const MaseHeader& header) const
274 {
275  //Write trees:
276  vector<string> treeNames = header.getTreeNames();
277  for (size_t i = 0; i < treeNames.size(); ++i) {
278  output << ";;$ " + treeNames[i] << endl;
279  output << ";;" + header.getTree(treeNames[i]);
280  output << endl;
281  }
282 
283  //Write site selections:
284  vector<string> siteSelectionNames = header.getSiteSelectionNames();
285  for (size_t i = 0; i < siteSelectionNames.size(); ++i) {
286  MultiRange<size_t> ranges = header.getSiteSelection(siteSelectionNames[i]);
287  output << ";;# of segments=" << ranges.size() << " " << siteSelectionNames[i] << endl;
288  output << ";;";
289  for (unsigned int j = 0; j < ranges.size(); ++j) {
290  output << " " << (ranges.getRange(j).begin() + 1) << "," << ranges.getRange(j).end();
291  if ((j + 1) % 10 == 0)
292  output << endl << ";;";
293  }
294  output << endl;
295  }
296 
297  //Write sequence selections:
298  vector<string> sequenceSelectionNames = header.getSequenceSelectionNames();
299  for (size_t i = 0; i < sequenceSelectionNames.size(); ++i) {
300  vector<size_t> set = header.getSequenceSelection(sequenceSelectionNames[i]);
301  output << ";;@ of species=" << set.size() << " " << sequenceSelectionNames[i] << endl;
302  output << ";;";
303  for (unsigned int j = 0; j < set.size(); ++j) {
304  output << " " << set[j];
305  if ((j + 1) % 10 == 0)
306  output << endl << ";;";
307  }
308  output << endl;
309  }
310 }
311 
312 /****************************************************************************************/
313 
std::vector< std::string > getSequenceSelectionNames() const
Definition: Mase.h:79
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Sequence.h:60
This alphabet is used to deal with NumericAlphabet.
std::vector< std::string > getTreeNames() const
Definition: Mase.h:77
STL namespace.
std::vector< std::string > getSiteSelectionNames() const
Definition: Mase.h:78
void writeSequences(std::ostream &output, const SequenceContainer &sc) const
Write a container to a stream.
Definition: Mase.cpp:119
void readHeader_(std::istream &input, MaseHeader &header) const
Definition: Mase.cpp:181
const std::string & getTree(const std::string &name) const
Definition: Mase.h:81
const MultiRange< size_t > & getSiteSelection(const std::string &name) const
Definition: Mase.h:88
A class to store information from the header of Mase files.
Definition: Mase.h:61
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
void writeHeader_(std::ostream &output, const MaseHeader &header) const
Definition: Mase.cpp:273
const std::vector< size_t > & getSequenceSelection(const std::string &name) const
Definition: Mase.h:95
The SequenceContainer interface.
void appendSequencesFromStream(std::istream &input, SequenceContainer &sc) const
Append sequences to a container from a stream.
Definition: Mase.cpp:49