bpp-seq  2.2.0
Phylip.cpp
Go to the documentation of this file.
1 //
2 // File: Phylip.cpp
3 // Created by: Julien Dutheil
4 // Created on: Mon Oct 27 12:22:56 2003
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "Phylip.h"
41 #include "../Container/SequenceContainerTools.h"
42 #include <Bpp/Text/TextTools.h>
43 #include <Bpp/Text/StringTokenizer.h>
44 #include <Bpp/Io/FileTools.h>
45 
46 using namespace bpp;
47 
48 // From the STL:
49 #include <sstream>
50 
51 using namespace std;
52 
53 /******************************************************************************/
54 
55 const std::vector<std::string> Phylip::splitNameAndSequence(const std::string& s) const throw (Exception)
56 {
57  vector<string> v(2);
58  if (extended_)
59  {
60  string::size_type index = s.find(namesSplit_);
61  if(index == string::npos) throw Exception("No sequence name found.");
62  v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, index));
63  v[1] = TextTools::removeFirstWhiteSpaces (s.substr(index + namesSplit_.size())); //There may be more than 2 white spaces.
64  }
65  else
66  {
67  v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, 10));
68  v[1] = s.substr(10);
69  }
70  return v;
71 }
72 
73 /******************************************************************************/
74 
75 void Phylip::readSequential(std::istream& in, SiteContainer& asc) const throw (Exception)
76 {
77  string temp;
78 
79  //Ignore first line:
80  getline(in, temp, '\n'); // Copy current line in temporary string
81  temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in));
82  string name = "";
83  string seq = "";
84 
85  while (!in.eof())
86  {
87  // Read each sequence:
88  vector<string> v;
89  bool hasName = true;
90  try
91  {
92  v = splitNameAndSequence(temp);
93  }
94  catch (Exception & e)
95  {
96  hasName = false;
97  }
98  if (hasName)
99  {
100  // a new sequence is found:
101  if (!TextTools::isEmpty(name)) //If this is not the first sequence!
102  {
103  // Add the previous sequence to the container:
104  asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_);
105  }
106  name = v[0];
107  seq = v[1];
108  }
109  else
110  {
111  //No sequence name found.
112  if (TextTools::isEmpty(name))
113  throw Exception("First sequence in file has no name!");
114  seq += TextTools::removeWhiteSpaces(temp);
115  }
116  //while(!TextTools::isEmpty(temp))
117  //{
118  // //Sequences are separated by at least one blank line:
119  // getline(in, temp, '\n'); // read next line in file.
120  // seq += TextTools::removeWhiteSpaces(temp);
121  //}
122  //end of this sequence:
123  temp = TextTools::removeSurroundingWhiteSpaces(FileTools::getNextLine(in));
124 
125  }
126  // Add last sequence:
127  asc.addSequence(BasicSequence(name, seq, asc.getAlphabet()), checkNames_);
128 }
129 
130 /******************************************************************************/
131 
132 void Phylip::readInterleaved(std::istream& in, SiteContainer& asc) const throw (Exception)
133 {
134  string temp;
135 
136  //Read first line:
137  getline(in, temp, '\n'); // Copy current line in temporary string
138  StringTokenizer st(temp);
139  unsigned int nbSequences = TextTools::to<unsigned int>(st.nextToken());
140  //int nbSites = TextTools::toInt(st.nextToken());
141  temp = FileTools::getNextLine(in);
142 
143  vector<string> names, seqs;
144  // Read first block:
145  for (unsigned int i = 0; i < nbSequences && !in.eof() && !TextTools::isEmpty(temp); i++)
146  {
147  vector<string> v = splitNameAndSequence(temp);
148  names.push_back(v[0]);
149  seqs.push_back(v[1]);
150  getline(in, temp, '\n'); // read next line in file.
151  }
152 
153  //Then read all other blocks:
154  temp = FileTools::getNextLine(in);
155  while (!in.eof())
156  {
157  for (unsigned int i = 0; i < names.size(); i++)
158  {
159  if (TextTools::isEmpty(temp))
160  throw IOException("Phylip::readInterleaved. Bad file,there are not the same number of sequence in each block.");
161  seqs[i] += TextTools::removeWhiteSpaces(temp);
162  getline(in, temp, '\n'); // read next line in file.
163  }
164  temp = FileTools::getNextLine(in);
165  }
166  for (unsigned int i = 0; i < names.size(); i++)
167  {
168  asc.addSequence(BasicSequence(names[i], seqs[i], asc.getAlphabet()), checkNames_);
169  }
170 }
171 
172 /******************************************************************************/
173 
174 void Phylip::appendAlignmentFromStream(std::istream& input, SiteContainer& vsc) const throw (Exception)
175 {
176  // Checking the existence of specified file
177  if (!input) { throw IOException ("Phylip::read: fail to open file"); }
178 
179  if(sequential_) readSequential (input, vsc);
180  else readInterleaved(input, vsc);
181 }
182 
183 /******************************************************************************/
184 
185 unsigned int Phylip::getNumberOfSequences(const std::string& path) const throw (IOException)
186 {
187  // Checking the existence of specified file
188  ifstream file (path.c_str(), ios::in);
189  if (! file) { throw IOException ("Phylip::getNumberOfSequences: failed to open file"); }
190  string firstLine = FileTools::getNextLine(file);
191  StringTokenizer st(firstLine, " \t");
192  istringstream iss(st.nextToken());
193  unsigned int nb;
194  iss >> nb;
195  file.close();
196  return nb;
197 }
198 
199 /******************************************************************************/
200 
201 std::vector<std::string> Phylip::getSizedNames(const std::vector<std::string>& names) const
202 {
203  vector<string> sizedNames(names.size());
204  if (extended_)
205  {
206  //Add 6 white spaces to the larger name and align other names.
207  //First, determine the size of the wider name:
208  size_t sizeMax = 0;
209  for (size_t i = 0; i < names.size(); i++)
210  if (names[i].size() > sizeMax) sizeMax = names[i].size();
211  //Quite easy ;-) Now update all lengths:
212  for (size_t i = 0; i < names.size(); i++)
213  sizedNames[i] = TextTools::resizeRight(names[i], sizeMax) + namesSplit_;
214  }
215  else
216  {
217  //We trunc all names to ten characters:
218  for(unsigned int i = 0; i < names.size(); i++) sizedNames[i] = TextTools::resizeRight(names[i], 10);
219  cout << "Warning: names have been truncated to 10 characters. They may be ambiguous sequence names then." << endl;
220  }
221  return sizedNames;
222 }
223 
224 /******************************************************************************/
225 
226 void Phylip::writeSequential(std::ostream& out, const SequenceContainer& sc) const
227 {
228  //cout << "Write sequential" << endl;
229  size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
230  out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
231 
232  vector<string> seqNames = sc.getSequencesNames();
233  vector<string> names = getSizedNames(seqNames);
234  for (size_t i = 0; i < seqNames.size(); ++i)
235  {
236  vector<string> seq = TextTools::split(sc.toString(seqNames[i]), charsByLine_);
237  out << names[i] << seq[0] << endl;
238  for (size_t j = 1; j < seq.size(); ++j)
239  {
240  out << string(names[i].size(), ' ') << seq[j] << endl;
241  }
242  out << endl;
243  }
244 }
245 
246 void Phylip::writeInterleaved(std::ostream& out, const SequenceContainer& sc) const
247 {
248  //cout << "Write interleaved;" << endl;
249  size_t numberOfSites = sc.getSequence(sc.getSequencesNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
250  out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
251 
252  vector<string> seqNames = sc.getSequencesNames();
253  vector<string> names = getSizedNames(seqNames);
254  //Split sequences:
255  vector< vector<string> > seqs(sc.getNumberOfSequences());
256  for (size_t i = 0; i < seqNames.size(); ++i)
257  {
258  seqs[i] = TextTools::split(sc.toString(seqNames[i]), charsByLine_);
259  }
260  //Write first block:
261  for (size_t i = 0; i < names.size(); ++i)
262  {
263  out << names[i] << seqs[i][0] << endl;
264  }
265  out << endl;
266  //Write other blocks:
267  for (size_t j = 1; j < seqs[0].size(); ++j)
268  {
269  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
270  {
271  out << seqs[i][j] << endl;
272  }
273  out << endl;
274  }
275 }
276 
277 /******************************************************************************/
278 
279 void Phylip::writeAlignment(std::ostream& output, const SiteContainer& sc) const throw (Exception)
280 {
281  //First must check if all sequences are aligned:
282  if (sc.getNumberOfSequences() == 0)
283  throw Exception("Phylip::write. SequenceContainer appear to contain no sequence.");
284 
285  // Checking the existence of specified file, and possibility to open it in write mode
286  if (!output) { throw IOException ("Phylip::write : failed to open file"); }
287 
288  if (sequential_) writeSequential (output, sc);
289  else writeInterleaved(output, sc);
290 }
291 
292 /******************************************************************************/
293 
294 const std::string Phylip::getFormatName() const { return "Phylip file, " + string(extended_ ? "extended," : "") + string(sequential_ ? "sequential" : "interleaved"); }
295 
296 /******************************************************************************/
297 
298 const std::string Phylip::getFormatDescription() const
299 {
300  return "Phylip file format, sequential and interleaved. PAML extension also supported.";
301 }
302 
303 /******************************************************************************/
304 
void writeInterleaved(std::ostream &out, const SequenceContainer &sc) const
Definition: Phylip.cpp:246
void readSequential(std::istream &in, SiteContainer &asc) const
Definition: Phylip.cpp:75
The SiteContainer interface.
Definition: SiteContainer.h:63
const std::string getFormatDescription() const
Definition: Phylip.cpp:298
This alphabet is used to deal NumericAlphabet.
std::vector< std::string > getSizedNames(const std::vector< std::string > &names) const
Definition: Phylip.cpp:201
void appendAlignmentFromStream(std::istream &input, SiteContainer &sc) const
Append sequences to a container from a stream.
Definition: Phylip.cpp:174
const std::string getFormatName() const
Definition: Phylip.cpp:294
virtual std::string toString(const std::string &name) const =0
Convert a particular sequence to a string.
void readInterleaved(std::istream &in, SiteContainer &asc) const
Definition: Phylip.cpp:132
STL namespace.
virtual unsigned int getStateCodingSize() const =0
Get the size of the string coding a state.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
void writeAlignment(std::ostream &output, const SiteContainer &sc) const
Write a container to a stream.
Definition: Phylip.cpp:279
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
virtual const Sequence & getSequence(const std::string &name) const =0
Retrieve a sequence object from the container.
virtual std::vector< std::string > getSequencesNames() const =0
Get all the names of the sequences in the container.
void writeSequential(std::ostream &out, const SequenceContainer &sc) const
Definition: Phylip.cpp:226
const std::vector< std::string > splitNameAndSequence(const std::string &s) const
Definition: Phylip.cpp:55
virtual const Alphabet * getAlphabet() const =0
Get sequence container's alphabet.
unsigned int getNumberOfSequences(const std::string &path) const
Definition: Phylip.cpp:185
The SequenceContainer interface.