bpp-seq-omics  2.2.0
AlignmentFilterMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: AlignmentFilterMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Sep 07 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
41 
42 using namespace bpp;
43 
44 //From the STL:
45 #include <string>
46 #include <numeric>
47 
48 using namespace std;
49 
51 {
52  if (blockBuffer_.size() == 0) {
53  //Else there is no more block in the buffer, we need to parse more:
54  do {
55  MafBlock* block = iterator_->nextBlock();
56  if (!block) return 0; //No more block.
57 
58  //Parse block.
59  int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
60  int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
61  size_t nr;
62  size_t nc = static_cast<size_t>(block->getNumberOfSites());
63 
64  vector< vector<int> > aln;
65  if (missingAsGap_) {
66  nr = species_.size();
67  aln.resize(nr);
68  for (size_t i = 0; i < nr; ++i) {
69  if (block->hasSequenceForSpecies(species_[i]))
70  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
71  else {
72  aln[i].resize(nc);
73  fill(aln[i].begin(), aln[i].end(), gap);
74  }
75  }
76  } else {
77  vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->getSpeciesList());
78  nr = speciesSet.size();
79  aln.resize(nr);
80  for (size_t i = 0; i < nr; ++i) {
81  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
82  }
83  }
84 
85 
86 
87 
88  //First we create a mask:
89  vector<size_t> pos;
90  vector<int> col(nr);
91  //Reset window:
92  window_.clear();
93  //Init window:
94  size_t i;
95  if (nc < windowSize_)
96  throw Exception("AlignmentFilterMafIterator::analyseCurrentBlock_. Block is smaller than window size: " + TextTools::toString(nc));
97  for (i = 0; i < windowSize_; ++i) {
98  for (size_t j = 0; j < nr; ++j) {
99  col[j] = aln[j][i];
100  }
101  window_.push_back(col);
102  }
103  //Slide window:
104  if (verbose_) {
105  ApplicationTools::message->endLine();
106  ApplicationTools::displayTask("Sliding window for alignment filter", true);
107  }
108  while (i + step_ < nc) {
109  if (verbose_)
110  ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
111  //Evaluate current window:
112  unsigned int sumGap = 0;
113  double sumEnt = 0;
114  for (size_t u = 0; u < window_.size(); ++u) {
115  for (size_t v = 0; v < window_[u].size(); ++v) {
116  if (window_[u][v] == gap || window_[u][v] == unk) {
117  sumGap++;
118  }
119  col[v] = window_[u][v];
120  if (col[v] == unk) col[v] = gap;
121  }
122  sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
123  }
124  if (sumGap > maxGap_ && (sumEnt / static_cast<double>(windowSize_)) > maxEnt_) {
125  if (pos.size() == 0) {
126  pos.push_back(i - windowSize_);
127  pos.push_back(i);
128  } else {
129  if (i - windowSize_ < pos[pos.size() - 1]) {
130  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
131  } else { //This is a new region
132  pos.push_back(i - windowSize_);
133  pos.push_back(i);
134  }
135  }
136  }
137 
138  //Move forward:
139  for (size_t k = 0; k < step_; ++k) {
140  for (size_t j = 0; j < nr; ++j) {
141  col[j] = aln[j][i];
142  }
143  window_.push_back(col);
144  window_.pop_front();
145  ++i;
146  }
147  }
148 
149  //Evaluate last window:
150  unsigned int sumGap = 0;
151  double sumEnt = 0;
152  for (size_t u = 0; u < window_.size(); ++u) {
153  for (size_t v = 0; v < window_[u].size(); ++v) {
154  if (window_[u][v] == gap || window_[u][v] == unk) {
155  sumGap++;
156  }
157  col[v] = window_[u][v];
158  if (col[v] == unk) col[v] = gap;
159  }
160  sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
161  }
162  if (sumGap > maxGap_ && (sumEnt / static_cast<double>(windowSize_)) > maxEnt_) {
163  if (pos.size() == 0) {
164  pos.push_back(i - windowSize_);
165  pos.push_back(i);
166  } else {
167  if (i - windowSize_ <= pos[pos.size() - 1]) {
168  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
169  } else { //This is a new region
170  pos.push_back(i - windowSize_);
171  pos.push_back(i);
172  }
173  }
174  }
175  if (verbose_)
176  ApplicationTools::displayTaskDone();
177 
178  //Now we remove regions with two many gaps, using a sliding window:
179  if (pos.size() == 0) {
180  blockBuffer_.push_back(block);
181  if (logstream_) {
182  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
183  }
184  } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
185  //Everything is removed:
186  if (logstream_) {
187  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
188  }
189  } else {
190  if (logstream_) {
191  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
192  }
193  if (verbose_) {
194  ApplicationTools::message->endLine();
195  ApplicationTools::displayTask("Spliting block", true);
196  }
197  for (i = 0; i < pos.size(); i+=2) {
198  if (verbose_)
199  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
200  if (logstream_) {
201  (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
202  }
203  if (pos[i] > 0) {
204  MafBlock* newBlock = new MafBlock();
205  newBlock->setScore(block->getScore());
206  newBlock->setPass(block->getPass());
207  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
208  MafSequence* subseq;
209  if (i == 0) {
210  subseq = block->getSequence(j).subSequence(0, pos[i]);
211  } else {
212  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
213  }
214  newBlock->addSequence(*subseq);
215  delete subseq;
216  }
217  blockBuffer_.push_back(newBlock);
218  }
219 
220  if (keepTrashedBlocks_) {
221  MafBlock* outBlock = new MafBlock();
222  outBlock->setScore(block->getScore());
223  outBlock->setPass(block->getPass());
224  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
225  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
226  outBlock->addSequence(*outseq);
227  delete outseq;
228  }
229  trashBuffer_.push_back(outBlock);
230  }
231  }
232  //Add last block:
233  if (pos[pos.size() - 1] < block->getNumberOfSites()) {
234  MafBlock* newBlock = new MafBlock();
235  newBlock->setScore(block->getScore());
236  newBlock->setPass(block->getPass());
237  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
238  MafSequence* subseq;
239  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
240  newBlock->addSequence(*subseq);
241  delete subseq;
242  }
243  blockBuffer_.push_back(newBlock);
244  }
245  if (verbose_)
246  ApplicationTools::displayTaskDone();
247 
248  delete block;
249  }
250  } while (blockBuffer_.size() == 0);
251  }
252 
253  MafBlock* block = blockBuffer_.front();
254  blockBuffer_.pop_front();
255  return block;
256 }
257 
259 {
260  if (blockBuffer_.size() == 0) {
261  //Else there is no more block in the buffer, we need to parse more:
262  do {
263  MafBlock* block = iterator_->nextBlock();
264  if (!block) return 0; //No more block.
265 
266  //Parse block.
267  int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
268  int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
269  size_t nr;
270  size_t nc = static_cast<size_t>(block->getNumberOfSites());
271 
272  vector< vector<int> > aln;
273  if (missingAsGap_) {
274  nr = species_.size();
275  aln.resize(nr);
276  for (size_t i = 0; i < nr; ++i) {
277  if (block->hasSequenceForSpecies(species_[i]))
278  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
279  else {
280  aln[i].resize(nc);
281  fill(aln[i].begin(), aln[i].end(), gap);
282  }
283  }
284  } else {
285  vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->getSpeciesList());
286  nr = speciesSet.size();
287  aln.resize(nr);
288  for (size_t i = 0; i < nr; ++i) {
289  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
290  }
291  }
292  //First we create a mask:
293  vector<size_t> pos;
294  vector<bool> col(nr);
295  //Reset window:
296  window_.clear();
297  //Init window:
298  size_t i;
299  for (i = 0; i < windowSize_; ++i) {
300  for (size_t j = 0; j < nr; ++j) {
301  col[j] = (aln[j][i] == gap || aln[j][i] == unk);
302  }
303  window_.push_back(col);
304  }
305  //Slide window:
306  if (verbose_) {
307  ApplicationTools::message->endLine();
308  ApplicationTools::displayTask("Sliding window for alignment filter", true);
309  }
310  while (i + step_ < nc) {
311  if (verbose_)
312  ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
313  //Evaluate current window:
314  unsigned int count = 0;
315  bool posIsGap = true;
316  for (size_t u = 0; u < window_.size(); ++u) {
317  unsigned int partialCount = 0;
318  if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
319  for (size_t v = 0; v < window_[u].size(); ++v)
320  if (window_[u][v]) partialCount++;
321  if (partialCount > maxGap_) {
322  count++;
323  posIsGap = true;
324  } else {
325  posIsGap = false;
326  }
327  }
328  }
329  if (count > maxPos_) {
330  if (pos.size() == 0) {
331  pos.push_back(i - windowSize_);
332  pos.push_back(i);
333  } else {
334  if (i - windowSize_ < pos[pos.size() - 1]) {
335  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
336  } else { //This is a new region
337  pos.push_back(i - windowSize_);
338  pos.push_back(i);
339  }
340  }
341  }
342 
343  //Move forward:
344  for (size_t k = 0; k < step_; ++k) {
345  for (size_t j = 0; j < nr; ++j) {
346  col[j] = (aln[j][i] == gap || aln[j][i] == unk);
347  }
348  window_.push_back(col);
349  window_.pop_front();
350  ++i;
351  }
352  }
353 
354  //Evaluate last window:
355  unsigned int count = 0;
356  bool posIsGap = true;
357  for (size_t u = 0; u < window_.size(); ++u) {
358  unsigned int partialCount = 0;
359  if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
360  for (size_t v = 0; v < window_[u].size(); ++v)
361  if (window_[u][v]) partialCount++;
362  if (partialCount > maxGap_) {
363  count++;
364  posIsGap = true;
365  } else {
366  posIsGap = false;
367  }
368  }
369  }
370  if (count > maxPos_) {
371  if (pos.size() == 0) {
372  pos.push_back(i - windowSize_);
373  pos.push_back(i);
374  } else {
375  if (i - windowSize_ <= pos[pos.size() - 1]) {
376  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
377  } else { //This is a new region
378  pos.push_back(i - windowSize_);
379  pos.push_back(i);
380  }
381  }
382  }
383  if (verbose_)
384  ApplicationTools::displayTaskDone();
385 
386  //Now we remove regions with two many gaps, using a sliding window:
387  if (pos.size() == 0) {
388  blockBuffer_.push_back(block);
389  if (logstream_) {
390  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
391  }
392  } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
393  //Everything is removed:
394  if (logstream_) {
395  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
396  }
397  } else {
398  if (logstream_) {
399  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
400  }
401  if (verbose_) {
402  ApplicationTools::message->endLine();
403  ApplicationTools::displayTask("Spliting block", true);
404  }
405  for (i = 0; i < pos.size(); i+=2) {
406  if (verbose_)
407  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
408  if (logstream_) {
409  (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
410  }
411  if (pos[i] > 0) {
412  MafBlock* newBlock = new MafBlock();
413  newBlock->setScore(block->getScore());
414  newBlock->setPass(block->getPass());
415  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
416  MafSequence* subseq;
417  if (i == 0) {
418  subseq = block->getSequence(j).subSequence(0, pos[i]);
419  } else {
420  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
421  }
422  newBlock->addSequence(*subseq);
423  delete subseq;
424  }
425  blockBuffer_.push_back(newBlock);
426  }
427 
428  if (keepTrashedBlocks_) {
429  MafBlock* outBlock = new MafBlock();
430  outBlock->setScore(block->getScore());
431  outBlock->setPass(block->getPass());
432  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
433  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
434  outBlock->addSequence(*outseq);
435  delete outseq;
436  }
437  trashBuffer_.push_back(outBlock);
438  }
439  }
440  //Add last block:
441  if (pos[pos.size() - 1] < block->getNumberOfSites()) {
442  MafBlock* newBlock = new MafBlock();
443  newBlock->setScore(block->getScore());
444  newBlock->setPass(block->getPass());
445  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
446  MafSequence* subseq;
447  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
448  newBlock->addSequence(*subseq);
449  delete subseq;
450  }
451  blockBuffer_.push_back(newBlock);
452  }
453  if (verbose_)
454  ApplicationTools::displayTaskDone();
455 
456  delete block;
457  }
458  } while (blockBuffer_.size() == 0);
459  }
460 
461  MafBlock* block = blockBuffer_.front();
462  blockBuffer_.pop_front();
463  return block;
464 }
465 
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
void setScore(double score)
Definition: MafBlock.h:102
unsigned int getPass() const
Definition: MafBlock.h:106
STL namespace.
size_t getNumberOfSites() const
Definition: MafBlock.h:113
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
Definition: MafSequence.cpp:48
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:55
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
std::vector< std::string > getSpeciesList() const
Definition: MafBlock.h:162
double getScore() const
Definition: MafBlock.h:105
void setPass(unsigned int pass)
Definition: MafBlock.h:103
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
std::string getDescription() const
Definition: MafBlock.h:177
const MafSequence & getSequence(const std::string &name) const
Definition: MafBlock.h:121
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:62