52 if (blockBuffer_.size() == 0) {
55 MafBlock* block = iterator_->nextBlock();
59 int gap = AlphabetTools::DNA_ALPHABET.getGapCharacterCode();
65 vector< vector<int> > aln;
66 if (missingAsGap_ && !ignoreGaps_) {
69 for (
size_t i = 0; i < nr; ++i) {
74 fill(aln[i].begin(), aln[i].end(), gap);
78 vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->
getSpeciesList());
79 nr = speciesSet.size();
81 for (
size_t i = 0; i < nr; ++i) {
91 for (i = 0; i < windowSize_; ++i) {
93 for (
size_t j = 0; j < nr; ++j) {
95 if (x != gap || !ignoreGaps_)
98 double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
99 window_.push_back(entropy > maxEnt_ ? 1 : 0);
103 ApplicationTools::message->endLine();
104 ApplicationTools::displayTask(
"Sliding window for entropy filter",
true);
106 while (i + step_ < nc) {
108 ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1,
'>');
110 unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
111 if (count > maxPos_) {
112 if (pos.size() == 0) {
113 pos.push_back(i - windowSize_);
116 if (i - windowSize_ < pos[pos.size() - 1]) {
117 pos[pos.size() - 1] = i;
119 pos.push_back(i - windowSize_);
126 for (
size_t k = 0; k < step_; ++k) {
128 for (
size_t j = 0; j < nr; ++j) {
130 if (x != gap || !ignoreGaps_)
133 double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
134 window_.push_back(entropy > maxEnt_ ? 1 : 0);
141 unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
142 if (count > maxPos_) {
143 if (pos.size() == 0) {
144 pos.push_back(i - windowSize_);
147 if (i - windowSize_ <= pos[pos.size() - 1]) {
148 pos[pos.size() - 1] = i;
150 pos.push_back(i - windowSize_);
156 ApplicationTools::displayTaskDone();
159 if (pos.size() == 0) {
160 blockBuffer_.push_back(block);
162 (*logstream_ <<
"ENTROPY CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
164 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
167 (*logstream_ <<
"ENTROPY CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
171 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
174 ApplicationTools::message->endLine();
175 ApplicationTools::displayTask(
"Spliting block",
true);
177 for (i = 0; i < pos.size(); i+=2) {
179 ApplicationTools::displayGauge(i, pos.size() - 2,
'=');
181 (*logstream_ <<
"ENTROPY CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
197 blockBuffer_.push_back(newBlock);
200 if (keepTrashedBlocks_) {
209 trashBuffer_.push_back(outBlock);
223 blockBuffer_.push_back(newBlock);
226 ApplicationTools::displayTaskDone();
230 }
while (blockBuffer_.size() == 0);
233 MafBlock* block = blockBuffer_.front();
234 blockBuffer_.pop_front();
const MafSequence & getSequenceForSpecies(const std::string &species) const
void setScore(double score)
unsigned int getPass() const
size_t getNumberOfSites() const
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
bool hasSequenceForSpecies(const std::string &species) const
A synteny block data structure, the basic unit of a MAF alignement file.
void addSequence(const MafSequence &sequence)
std::vector< std::string > getSpeciesList() const
void setPass(unsigned int pass)
MafBlock * analyseCurrentBlock_()
size_t getNumberOfSequences() const
std::string getDescription() const
const MafSequence & getSequence(const std::string &name) const
A sequence class which is used to store data from MAF files.