52 if (blockBuffer_.size() == 0) {
55 MafBlock* block = iterator_->nextBlock();
59 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
60 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
64 vector< vector<int> > aln;
68 for (
size_t i = 0; i < nr; ++i) {
73 fill(aln[i].begin(), aln[i].end(), gap);
77 vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->
getSpeciesList());
78 nr = speciesSet.size();
80 for (
size_t i = 0; i < nr; ++i) {
96 throw Exception(
"AlignmentFilterMafIterator::analyseCurrentBlock_. Block is smaller than window size: " + TextTools::toString(nc));
97 for (i = 0; i < windowSize_; ++i) {
98 for (
size_t j = 0; j < nr; ++j) {
101 window_.push_back(col);
105 ApplicationTools::message->endLine();
106 ApplicationTools::displayTask(
"Sliding window for alignment filter",
true);
108 while (i + step_ < nc) {
110 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1,
'>');
112 unsigned int sumGap = 0;
114 for (
size_t u = 0; u < window_.size(); ++u) {
115 for (
size_t v = 0; v < window_[u].size(); ++v) {
116 if (window_[u][v] == gap || window_[u][v] == unk) {
119 col[v] = window_[u][v];
120 if (col[v] == unk) col[v] = gap;
122 sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
124 if (sumGap > maxGap_ && (sumEnt / static_cast<double>(windowSize_)) > maxEnt_) {
125 if (pos.size() == 0) {
126 pos.push_back(i - windowSize_);
129 if (i - windowSize_ < pos[pos.size() - 1]) {
130 pos[pos.size() - 1] = i;
132 pos.push_back(i - windowSize_);
139 for (
size_t k = 0; k < step_; ++k) {
140 for (
size_t j = 0; j < nr; ++j) {
143 window_.push_back(col);
150 unsigned int sumGap = 0;
152 for (
size_t u = 0; u < window_.size(); ++u) {
153 for (
size_t v = 0; v < window_[u].size(); ++v) {
154 if (window_[u][v] == gap || window_[u][v] == unk) {
157 col[v] = window_[u][v];
158 if (col[v] == unk) col[v] = gap;
160 sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
162 if (sumGap > maxGap_ && (sumEnt / static_cast<double>(windowSize_)) > maxEnt_) {
163 if (pos.size() == 0) {
164 pos.push_back(i - windowSize_);
167 if (i - windowSize_ <= pos[pos.size() - 1]) {
168 pos[pos.size() - 1] = i;
170 pos.push_back(i - windowSize_);
176 ApplicationTools::displayTaskDone();
179 if (pos.size() == 0) {
180 blockBuffer_.push_back(block);
182 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
184 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
187 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
191 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
194 ApplicationTools::message->endLine();
195 ApplicationTools::displayTask(
"Spliting block",
true);
197 for (i = 0; i < pos.size(); i+=2) {
199 ApplicationTools::displayGauge(i, pos.size() - 2,
'=');
201 (*logstream_ <<
"ALN CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
217 blockBuffer_.push_back(newBlock);
220 if (keepTrashedBlocks_) {
229 trashBuffer_.push_back(outBlock);
243 blockBuffer_.push_back(newBlock);
246 ApplicationTools::displayTaskDone();
250 }
while (blockBuffer_.size() == 0);
253 MafBlock* block = blockBuffer_.front();
254 blockBuffer_.pop_front();
260 if (blockBuffer_.size() == 0) {
263 MafBlock* block = iterator_->nextBlock();
264 if (!block)
return 0;
267 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
268 int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
272 vector< vector<int> > aln;
274 nr = species_.size();
276 for (
size_t i = 0; i < nr; ++i) {
281 fill(aln[i].begin(), aln[i].end(), gap);
285 vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->
getSpeciesList());
286 nr = speciesSet.size();
288 for (
size_t i = 0; i < nr; ++i) {
294 vector<bool> col(nr);
299 for (i = 0; i < windowSize_; ++i) {
300 for (
size_t j = 0; j < nr; ++j) {
301 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
303 window_.push_back(col);
307 ApplicationTools::message->endLine();
308 ApplicationTools::displayTask(
"Sliding window for alignment filter",
true);
310 while (i + step_ < nc) {
312 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1,
'>');
314 unsigned int count = 0;
315 bool posIsGap =
true;
316 for (
size_t u = 0; u < window_.size(); ++u) {
317 unsigned int partialCount = 0;
318 if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
319 for (
size_t v = 0; v < window_[u].size(); ++v)
320 if (window_[u][v]) partialCount++;
321 if (partialCount > maxGap_) {
329 if (count > maxPos_) {
330 if (pos.size() == 0) {
331 pos.push_back(i - windowSize_);
334 if (i - windowSize_ < pos[pos.size() - 1]) {
335 pos[pos.size() - 1] = i;
337 pos.push_back(i - windowSize_);
344 for (
size_t k = 0; k < step_; ++k) {
345 for (
size_t j = 0; j < nr; ++j) {
346 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
348 window_.push_back(col);
355 unsigned int count = 0;
356 bool posIsGap =
true;
357 for (
size_t u = 0; u < window_.size(); ++u) {
358 unsigned int partialCount = 0;
359 if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
360 for (
size_t v = 0; v < window_[u].size(); ++v)
361 if (window_[u][v]) partialCount++;
362 if (partialCount > maxGap_) {
370 if (count > maxPos_) {
371 if (pos.size() == 0) {
372 pos.push_back(i - windowSize_);
375 if (i - windowSize_ <= pos[pos.size() - 1]) {
376 pos[pos.size() - 1] = i;
378 pos.push_back(i - windowSize_);
384 ApplicationTools::displayTaskDone();
387 if (pos.size() == 0) {
388 blockBuffer_.push_back(block);
390 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
392 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
395 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
399 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
402 ApplicationTools::message->endLine();
403 ApplicationTools::displayTask(
"Spliting block",
true);
405 for (i = 0; i < pos.size(); i+=2) {
407 ApplicationTools::displayGauge(i, pos.size() - 2,
'=');
409 (*logstream_ <<
"ALN CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
425 blockBuffer_.push_back(newBlock);
428 if (keepTrashedBlocks_) {
437 trashBuffer_.push_back(outBlock);
451 blockBuffer_.push_back(newBlock);
454 ApplicationTools::displayTaskDone();
458 }
while (blockBuffer_.size() == 0);
461 MafBlock* block = blockBuffer_.front();
462 blockBuffer_.pop_front();
const MafSequence & getSequenceForSpecies(const std::string &species) const
MafBlock * analyseCurrentBlock_()
void setScore(double score)
MafBlock * analyseCurrentBlock_()
unsigned int getPass() const
size_t getNumberOfSites() const
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
bool hasSequenceForSpecies(const std::string &species) const
A synteny block data structure, the basic unit of a MAF alignement file.
void addSequence(const MafSequence &sequence)
std::vector< std::string > getSpeciesList() const
void setPass(unsigned int pass)
size_t getNumberOfSequences() const
std::string getDescription() const
const MafSequence & getSequence(const std::string &name) const
A sequence class which is used to store data from MAF files.