bpp-seq-omics  2.2.0
MafParser.cpp
Go to the documentation of this file.
1 //
2 // File: MafParser.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Apr 27 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "MafParser.h"
41 #include <Bpp/Seq/SequenceWithQuality.h>
42 #include <Bpp/Seq/SequenceWithAnnotationTools.h>
43 #include <Bpp/Text/TextTools.h>
44 #include <Bpp/Text/KeyvalTools.h>
45 
46 using namespace std;
47 using namespace bpp;
48 
49 MafBlock* MafParser::analyseCurrentBlock_() throw (Exception)
50 {
51  MafBlock* block = 0;
52 
53  string line;
54  bool test = true;
55  MafSequence* currentSequence = 0;
56  while (test)
57  {
58  if (stream_->eof()) return 0;
59  getline(*stream_, line, '\n');
60  if (TextTools::isEmpty(line))
61  {
62  if (firstBlock_)
63  continue;
64  if (currentSequence) {
65  //Add previous sequence:
66  block->addSequence(*currentSequence); //The sequence is copied in the container.
67  delete currentSequence;
68  }
69 
70  //end of paragraph
71  test = false;
72  }
73  else if (line[0] == 'a')
74  {
75  if (currentSequence) {
76  //Add previous sequence:
77  block->addSequence(*currentSequence); //The sequence is copied in the container.
78  delete currentSequence;
79  }
80 
81  //New block.
82  block = new MafBlock();
83  firstBlock_ = false;
84 
85  map<string, string> args;
86  if (line.size() > 2)
87  {
88  KeyvalTools::multipleKeyvals(line.substr(2), args, " ");
89 
90  if (args.find("score") != args.end())
91  if (args["score"] != "NA")
92  block->setScore(TextTools::toDouble(args["score"]));
93 
94  if (args.find("pass") != args.end())
95  block->setPass(TextTools::to<unsigned int>(args["pass"]));
96  }
97  }
98  else if (line[0] == 's')
99  {
100  StringTokenizer st(line);
101  st.nextToken(); //The 's' tag
102  string src = st.nextToken();
103  unsigned int start = TextTools::to<unsigned int>(st.nextToken());
104  unsigned int size = TextTools::to<unsigned int>(st.nextToken());
105  string tmp = st.nextToken();
106  if (tmp.size() != 1)
107  throw Exception("MafAlignmentParser::nextBlock. Strand specification is incorrect, should be only one character long, found " + TextTools::toString(tmp.size()) + ".");
108  char strand = tmp[0];
109 
110  unsigned int srcSize = TextTools::to<unsigned int>(st.nextToken());
111  if (currentSequence) {
112  //Add previous sequence:
113  block->addSequence(*currentSequence); //The sequence is copied in the container.
114  delete currentSequence;
115  }
116  const string seq = st.nextToken();
117  currentSequence = new MafSequence(src, seq, start, strand, srcSize);
118  if (currentSequence->getGenomicSize() != size)
119  throw Exception("MafAlignmentParser::nextBlock. Sequence found (" + src + ") does not match specified size: " + TextTools::toString(currentSequence->getGenomicSize()) + ", should be " + TextTools::toString(size) + ".");
120 
121  //Add mask:
122  if (mask_) {
123  vector<bool> mask(currentSequence->size());
124  for (unsigned int i = 0; i < mask.size(); ++i) {
125  mask[i] = cmAlphabet_.isMasked(seq[i]);
126  }
127  currentSequence->addAnnotation(new SequenceMask(mask));
128  }
129  }
130  else if (line[0] == 'q')
131  {
132  if (!currentSequence)
133  throw Exception("MafAlignmentParser::nextBlock(). Quality scores found, but there is currently no sequence!");
134  StringTokenizer st(line);
135  st.nextToken(); //The 'q' tag
136  string name = st.nextToken();
137  if (name != currentSequence->getName())
138  throw Exception("MafAlignmentParser::nextBlock(). Quality scores found, but with a different name from the previous sequence: " + name + ", should be " + currentSequence->getName() + ".");
139  string qstr = st.nextToken();
140  //Now parse the score string:
141  SequenceQuality* seqQual = new SequenceQuality(qstr.size());
142  for (unsigned int i = 0; i < qstr.size(); ++i) {
143  char c = qstr[i];
144  if (c == '-') {
145  seqQual->setScore(i, -1);
146  } else if (c == '0' || c == '1' || c == '2' || c== '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9') {
147  seqQual->setScore(i, c - '0');
148  } else if (c == 'F' || c == 'f') { //Finished
149  seqQual->setScore(i, 10);
150  } else if (c == '?' || c == '.') {
151  seqQual->setScore(i, -2);
152  } else {
153  throw Exception("MafAlignmentParser::nextBlock(). Unvalid quality score: " + TextTools::toString(c) + ". Should be 0-9, F or '-'.");
154  }
155  }
156  currentSequence->addAnnotation(seqQual);
157  }
158  }
159  return block;
160 }
161 
void setScore(double score)
Definition: MafBlock.h:102
STL namespace.
size_t getGenomicSize() const
Definition: MafSequence.h:158
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:55
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
void setPass(unsigned int pass)
Definition: MafBlock.h:103
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:62