bpp-seq  2.2.0
Fasta.cpp
Go to the documentation of this file.
1 //
2 // File: Fasta.cpp
3 // Authors: Guillaume Deuchst
4 // Julien Dutheil
5 // Sylvain Gaillard
6 // Created: Tue Aug 21 2003
7 //
8 
9 /*
10 Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
11 
12 This software is a computer program whose purpose is to provide classes
13 for sequences analysis.
14 
15 This software is governed by the CeCILL license under French law and
16 abiding by the rules of distribution of free software. You can use,
17 modify and/ or redistribute the software under the terms of the CeCILL
18 license as circulated by CEA, CNRS and INRIA at the following URL
19 "http://www.cecill.info".
20 
21 As a counterpart to the access to the source code and rights to copy,
22 modify and redistribute granted by the license, users are provided only
23 with a limited warranty and the software's author, the holder of the
24 economic rights, and the successive licensors have only limited
25 liability.
26 
27 In this respect, the user's attention is drawn to the risks associated
28 with loading, using, modifying and/or developing or reproducing the
29 software by the user in light of its specific status of free software,
30 that may mean that it is complicated to manipulate, and that also
31 therefore means that it is reserved for developers and experienced
32 professionals having in-depth computer knowledge. Users are therefore
33 encouraged to load and test the software's suitability as regards their
34 requirements in conditions enabling the security of their systems and/or
35 data to be ensured and, more generally, to use and operate it in the
36 same conditions as regards security.
37 
38 The fact that you are presently reading this means that you have had
39 knowledge of the CeCILL license and that you accept its terms.
40 */
41 
42 #include "Fasta.h"
43 
44 #include <fstream>
45 
46 #include "../StringSequenceTools.h"
47 #include <Bpp/Text/TextTools.h>
48 #include <Bpp/Text/StringTokenizer.h>
49 #include <Bpp/Io/FileTools.h>
50 
51 using namespace bpp;
52 using namespace std;
53 
54 /******************************************************************************/
55 
56 bool Fasta::nextSequence(istream& input, Sequence& seq) const throw (Exception) {
57  if (!input)
58  throw IOException("Fasta::nextSequence: can't read from istream input");
59  string seqname = "";
60  string content = "";
61  Comments seqcmts;
62  short seqcpt = 0;
63  string linebuffer = "";
64  char c;
65  while (!input.eof())
66  {
67  c = static_cast<char>(input.peek());
68  if (input.eof())
69  c = '\n';
70 
71  // Sequence begining detection
72  if (c == '>')
73  {
74  // Stop if find a new sequence
75  if (seqcpt++)
76  break;
77  }
78  getline(input, linebuffer);
79  if (c == '>')
80  {
81  // Get the sequence name line
82  seqname = string(linebuffer.begin() + 1, linebuffer.end());
83  }
84  if (c != '>' && !TextTools::isWhiteSpaceCharacter(c)) {
85  // Sequence content
86  content += TextTools::toUpper(TextTools::removeWhiteSpaces(linebuffer));
87  }
88  }
89 
90  bool res = (!input.eof());
91  // Sequence name and comments isolation
92  if (strictNames_ || extended_) {
93  size_t pos = seqname.find_first_of(" \t\n");
94  string seqcmt;
95  if (pos != string::npos) {
96  seqcmt = seqname.substr(pos + 1);
97  seqname = seqname.substr(0, pos);
98  }
99  if (extended_) {
100  StringTokenizer st(seqcmt, " \\", true, false);
101  while (st.hasMoreToken()) {
102  seqcmts.push_back(st.nextToken());
103  }
104  } else {
105  seqcmts.push_back(seqcmt);
106  }
107  seq.setComments(seqcmts);
108  }
109  seq.setName(seqname);
110  seq.setContent(content);
111  return res;
112 }
113 
114 /******************************************************************************/
115 
116 void Fasta::writeSequence(ostream& output, const Sequence& seq) const throw (Exception)
117 {
118  if (!output)
119  throw IOException("Fasta::writeSequence: can't write to ostream output");
120  // Sequence name
121  output << ">" << seq.getName();
122  // Sequence comments
123  if (extended_)
124  {
125  for (unsigned int i = 0 ; i < seq.getComments().size() ; i++)
126  {
127  output << " \\" << seq.getComments()[i];
128  }
129  }
130  output << endl;
131  // Sequence content
132  string buffer; // use a buffer to format sequence with states > 1 char
133  for (size_t i = 0 ; i < seq.size() ; ++i)
134  {
135  buffer += seq.getChar(i);
136  if (buffer.size() >= charsByLine_)
137  {
138  output << string(buffer.begin(), buffer.begin() + charsByLine_) << endl;
139  buffer.erase(0, charsByLine_);
140  }
141  }
142  output << string(buffer.begin(), buffer.end()) << endl;
143 }
144 
145 /******************************************************************************/
146 
147 void Fasta::appendSequencesFromStream(istream& input, SequenceContainer& vsc) const throw (Exception)
148 {
149  if (!input)
150  throw IOException("Fasta::appendFromStream: can't read from istream input");
151  char c = '\n';
152  char last_c;
153  bool header = false;
154  bool hasSeq = true;
155  string line = "";
156  Comments cmts;
157  while (!input.eof() && hasSeq)
158  {
159  last_c = c;
160  input.get(c);
161  // Header detection
162  if (extended_ && c == '#')
163  {
164  header = true;
165  continue;
166  }
167  // Header end detection
168  if (c == '\n')
169  {
170  if (extended_ && header)
171  {
172  if (line[0] == '\\')
173  {
174  line.erase(line.begin());
175  cmts.push_back(line);
176  }
177  line = "";
178  header = false;
179  }
180  continue;
181  }
182  // Header capture
183  if (header)
184  {
185  line.append(1, c);
186  }
187  // Sequence detection
188  if (c == '>' && last_c == '\n')
189  {
190  input.putback(c);
191  c = last_c;
192  BasicSequence tmpseq("", "", vsc.getAlphabet());
193  hasSeq = nextSequence(input, tmpseq);
194  vsc.addSequence(tmpseq, checkNames_);
195  }
196  }
197  if (extended_ && cmts.size()) {
198  vsc.setGeneralComments(cmts);
199  }
200 }
201 
202 /******************************************************************************/
203 
204 void Fasta::writeSequences(ostream& output, const SequenceContainer& sc) const throw (Exception)
205 {
206  if (!output)
207  throw IOException("Fasta::write: can't write to ostream output");
208 
209  if (extended_)
210  {
211  // Loop for all general comments
212  for (unsigned int i = 0 ; i < sc.getGeneralComments().size() ; i++)
213  {
214  output << "#\\" << sc.getGeneralComments()[i] << endl;
215  }
216  output << endl;
217  }
218 
219  // Main loop : for all sequences in vector container
220  vector<string> names = sc.getSequencesNames();
221  for (size_t i = 0; i < names.size(); ++i)
222  {
223  writeSequence(output, sc.getSequence(names[i]));
224  }
225 }
226 
227 /******************************************************************************/
228 
229 // FileIndex class
230 
231 void Fasta::FileIndex::build(const std::string& path) throw (Exception) {
232  // open the file
233  std::ifstream f_in(path.c_str());
234  // get the size of the file
235  f_in.seekg(0, std::ios::end);
236  fileSize_ = f_in.tellg();
237  // feed the map
238  f_in.seekg(0, std::ios::beg);
239  streampos pos = f_in.tellg();
240  char ch;
241  std::string seq_id = "";
242  while (f_in.get(ch)) {
243  if (ch == '>') {
244  pos = static_cast<int>(f_in.tellg()) - 1;
245  std::getline(f_in, seq_id);
246  index_[seq_id] = pos;
247  }
248  }
249  f_in.close();
250 }
251 
252 streampos Fasta::FileIndex::getSequencePosition(const std::string& id) const throw (Exception) {
253  std::map<std::string, streampos>::const_iterator it = index_.find(id);
254  if (it != index_.end()) {
255  return it->second;
256  }
257  throw Exception("Sequence not found: " + id);
258 }
259 
260 void Fasta::FileIndex::read(const std::string& path) throw (Exception) {
261  std::ifstream f_in(path.c_str());
262  std::string line_buffer = "";
263  while (!f_in.eof()) {
264  std::getline(f_in, line_buffer);
265  if (bpp::TextTools::isEmpty(bpp::TextTools::removeSurroundingWhiteSpaces(line_buffer))) {
266  continue;
267  }
268  bpp::StringTokenizer tk(line_buffer, "\t");
269  index_[tk.getToken(0)] = bpp::TextTools::toInt(tk.getToken(1));
270  }
271  f_in.close();
272 }
273 
274 void Fasta::FileIndex::write(const std::string& path) throw (Exception) {
275  std::ofstream f_out(path.c_str());
276  for (std::map<std::string, streampos>::const_iterator it = index_.begin() ; it != index_.end() ; ++it) {
277  f_out << it->first << "\t" << bpp::TextTools::toString(it->second) << std::endl;
278  }
279  f_out.close();
280 }
281 
282 void Fasta::FileIndex::getSequence(const std::string& seqid, Sequence& seq, const std::string& path) const {
283  Fasta fs(60);
284  streampos seq_pos = this->getSequencePosition(seqid);
285  std::ifstream fasta(path.c_str());
286  fasta.seekg(seq_pos);
287  fs.nextSequence(fasta, seq);
288  fasta.close();
289 }
290 
291 /******************************************************************************/
292 
void build(const std::string &path)
Build the index given a path to the file.
Definition: Fasta.cpp:231
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Sequence.h:60
The fasta sequence file format.
Definition: Fasta.h:63
void getSequence(const std::string &seqid, Sequence &seq, const std::string &path) const
Get a sequence given its ID.
Definition: Fasta.cpp:282
This alphabet is used to deal with NumericAlphabet.
std::streampos getSequencePosition(const std::string &id) const
Get the position of a Sequence given its ID.
Definition: Fasta.cpp:252
STL namespace.
void read(const std::string &path)
Read the index from a file.
Definition: Fasta.cpp:260
A basic implementation of the Sequence interface.
Definition: Sequence.h:207
void write(const std::string &path)
Write the index to a file.
Definition: Fasta.cpp:274
void writeSequence(std::ostream &output, const Sequence &seq) const
Write sequence to stream.
Definition: Fasta.cpp:116
The sequence interface.
Definition: Sequence.h:74
void writeSequences(std::ostream &output, const SequenceContainer &sc) const
Write a container to a stream.
Definition: Fasta.cpp:204
void appendSequencesFromStream(std::istream &input, SequenceContainer &sc) const
Append sequences to a container from a stream.
Definition: Fasta.cpp:147
The SequenceContainer interface.
bool nextSequence(std::istream &input, Sequence &seq) const
Read sequence from stream.
Definition: Fasta.cpp:56