bpp-seq-omics  2.2.0
FeatureFilterMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: FeatureFilterMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Sep 07 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
41 
42 using namespace bpp;
43 
44 //From the STL:
45 #include <string>
46 #include <numeric>
47 
48 using namespace std;
49 
51 {
52  if (blockBuffer_.size() == 0) {
53  //Unless there is no more block in the buffer, we need to parse more:
54  do {
55  MafBlock* block = iterator_->nextBlock();
56  if (!block) return 0; //No more block.
57 
58  //Check if the block contains the reference species:
59  if (!block->hasSequenceForSpecies(refSpecies_)) {
60  if (logstream_) {
61  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain the reference species and was kept as is.").endLine();
62  }
63  return block;
64  }
65 
66  //Get the feature ranges for this block:
67  const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
68  //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species):
69  std::map<std::string, MultiRange<size_t> >::iterator mr = ranges_.find(refSeq.getChromosome());
70  if (mr == ranges_.end()) {
71  if (logstream_) {
72  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
73  }
74  return block;
75  }
76  //else
77  MultiRange<size_t> mRange = mr->second;
78  //mRange.restrictTo(Range<size_t>(refSeq.start(), refSeq.stop() + 1)); jdutheil on 17/04/13: do we really need the +1 here?
79  mRange.restrictTo(refSeq.getRange(true));
80  if (mRange.isEmpty()) {
81  if (logstream_) {
82  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
83  }
84  return block;
85  }
86  std::vector<size_t> tmp = mRange.getBounds();
87 
88  //If the reference sequence is on the negative strand, then we have to correct the coordinates:
89  std::deque<size_t> refBounds;
90  if (refSeq.getStrand() == '-') {
91  for (size_t i = 0; i < tmp.size(); ++i)
92  {
93  refBounds.push_front(refSeq.getSrcSize() - tmp[i]);
94  }
95  } else {
96  refBounds = deque<size_t>(tmp.begin(), tmp.end());
97  }
98 
99  //Now extract corresponding alignments. We use the range to split the original block.
100  //Only thing to watch out is the coordinates, refering to the ref species...
101  //A good idea is then to convert those with respect to the given block:
102 
103  int gap = refSeq.getAlphabet()->getGapCharacterCode();
104  long int refPos = static_cast<long int>(refSeq.start()) - 1;
105  //long int refPos = refSeq.getStrand() == '-' ? static_cast<long int>(refSeq.getSrcSize() - refSeq.start()) - 1 : static_cast<long int>(refSeq.start()) - 1;
106  std::vector<size_t> pos;
107  if (verbose_) {
108  ApplicationTools::message->endLine();
109  ApplicationTools::displayTask("Removing features", true);
110  }
111  for (size_t alnPos = 0; alnPos < refSeq.size() && refBounds.size() > 0; ++alnPos) {
112  if (verbose_)
113  ApplicationTools::displayGauge(alnPos, refBounds.back(), '>');
114  if (refSeq[alnPos] != gap) {
115  refPos++;
116  //check if this position is a bound:
117  if (refBounds.front() == static_cast<size_t>(refPos)) {
118  pos.push_back(alnPos);
119  refBounds.pop_front();
120  }
121  }
122  }
123  if (verbose_)
124  ApplicationTools::displayTaskDone();
125 
126  //Check if the last bound matches the end of the alignment:
127  if (refBounds.size() > 0 && refBounds.front() == refSeq.stop()) {
128  pos.push_back(refSeq.size());
129  refBounds.pop_front();
130  }
131 
132  if (refBounds.size() > 0) {
133  VectorTools::print(vector<size_t>(refBounds.begin(), refBounds.end()));
134  throw Exception("FeatureFilterMafIterator::nextBlock(). An error occurred here, " + TextTools::toString(refBounds.size()) + " coordinates are left, in sequence " + refSeq.getDescription() + "... this is most likely a bug, please report!");
135  }
136 
137  //Next step is simply to split the black according to the translated coordinates:
138  if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
139  //Everything is removed:
140  if (logstream_) {
141  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
142  }
143  } else {
144  if (logstream_) {
145  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
146  }
147  if (verbose_) {
148  ApplicationTools::displayTask("Spliting block", true);
149  }
150  for (size_t i = 0; i < pos.size(); i+=2) {
151  if (verbose_)
152  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
153  if (logstream_) {
154  (*logstream_ << "FEATURE FILTER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
155  }
156  if (pos[i] > 0) {
157  MafBlock* newBlock = new MafBlock();
158  newBlock->setScore(block->getScore());
159  newBlock->setPass(block->getPass());
160  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
161  MafSequence* subseq;
162  if (i == 0) {
163  subseq = block->getSequence(j).subSequence(0, pos[i]);
164  } else {
165  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
166  }
167  newBlock->addSequence(*subseq);
168  delete subseq;
169  }
170  blockBuffer_.push_back(newBlock);
171  }
172 
173  if (keepTrashedBlocks_) {
174  MafBlock* outBlock = new MafBlock();
175  outBlock->setScore(block->getScore());
176  outBlock->setPass(block->getPass());
177  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
178  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
179  outBlock->addSequence(*outseq);
180  delete outseq;
181  }
182  trashBuffer_.push_back(outBlock);
183  }
184  }
185  //Add last block:
186  if (pos.back() < block->getNumberOfSites()) {
187  MafBlock* newBlock = new MafBlock();
188  newBlock->setScore(block->getScore());
189  newBlock->setPass(block->getPass());
190  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
191  MafSequence* subseq;
192  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
193  newBlock->addSequence(*subseq);
194  delete subseq;
195  }
196  blockBuffer_.push_back(newBlock);
197  }
198  if (verbose_)
199  ApplicationTools::displayTaskDone();
200 
201  delete block;
202  }
203  } while (blockBuffer_.size() == 0);
204  }
205 
206  MafBlock* nxtBlock = blockBuffer_.front();
207  blockBuffer_.pop_front();
208  return nxtBlock;
209 }
210 
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
void setScore(double score)
Definition: MafBlock.h:102
size_t stop() const
Definition: MafSequence.h:111
unsigned int getPass() const
Definition: MafBlock.h:106
STL namespace.
size_t getNumberOfSites() const
Definition: MafBlock.h:113
char getStrand() const
Definition: MafSequence.h:156
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
Definition: MafSequence.cpp:48
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:55
size_t getSrcSize() const
Definition: MafSequence.h:160
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
const std::string & getChromosome() const
Definition: MafSequence.h:154
Range< size_t > getRange(bool origin=true) const
Definition: MafSequence.h:121
double getScore() const
Definition: MafBlock.h:105
void setPass(unsigned int pass)
Definition: MafBlock.h:103
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
std::string getDescription() const
Definition: MafBlock.h:177
const MafSequence & getSequence(const std::string &name) const
Definition: MafBlock.h:121
size_t start() const
Definition: MafSequence.h:106
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:62
std::string getDescription() const
Definition: MafSequence.h:178