43 #include "../Model/FrequenciesSet/NucleotideFrequenciesSet.h" 44 #include "../Model/FrequenciesSet/ProteinFrequenciesSet.h" 45 #include "../Model/FrequenciesSet/CodonFrequenciesSet.h" 46 #include "../Model/FrequenciesSet/MvaFrequenciesSet.h" 49 #include <Bpp/Io/FileTools.h> 50 #include <Bpp/Text/TextTools.h> 51 #include <Bpp/Text/StringTokenizer.h> 52 #include <Bpp/Text/KeyvalTools.h> 54 #include <Bpp/Numeric/AutoParameter.h> 57 #include <Bpp/Seq/Alphabet/AlphabetTools.h> 58 #include <Bpp/Seq/App/SequenceApplicationTools.h> 59 #include <Bpp/Seq/Container/SequenceContainerTools.h> 78 unparsedArguments_.clear();
80 map<string, string> args;
81 KeyvalTools::parseProcedure(freqDescription, freqName, args);
82 auto_ptr<FrequenciesSet> pFS;
84 if (freqName ==
"Fixed")
86 if (AlphabetTools::isNucleicAlphabet(alphabet))
88 if (alphabetCode_ & NUCLEOTIDE)
91 throw Exception(
"Nucleotide alphabet not supported.");
93 else if (AlphabetTools::isProteicAlphabet(alphabet))
95 if (alphabetCode_ & PROTEIN)
98 throw Exception(
"Protein alphabet not supported.");
100 else if (AlphabetTools::isCodonAlphabet(alphabet))
102 if (alphabetCode_ & CODON) {
104 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
107 throw Exception(
"Codon alphabet not supported.");
115 else if (freqName ==
"Full")
117 unsigned short method = 1;
118 if (args.find(
"method") != args.end()){
119 if (args[
"method"] ==
"local")
122 if (args[
"method"] ==
"binary")
126 if (AlphabetTools::isNucleicAlphabet(alphabet))
128 if (alphabetCode_ & NUCLEOTIDE)
131 throw Exception(
"Nucleotide alphabet not supported.");
133 else if (AlphabetTools::isProteicAlphabet(alphabet))
135 if (alphabetCode_ & PROTEIN)
138 throw Exception(
"Protein alphabet not supported.");
140 else if (AlphabetTools::isCodonAlphabet(alphabet))
142 if (alphabetCode_ & CODON) {
144 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
147 throw Exception(
"Codon alphabet not supported.");
156 else if (freqName ==
"GC")
158 if (!AlphabetTools::isNucleicAlphabet(alphabet))
159 throw Exception(
"Error, unvalid frequencies " + freqName +
" with non-nucleic alphabet.");
160 if (alphabetCode_ & NUCLEOTIDE)
161 pFS.reset(
new GCFrequenciesSet(dynamic_cast<const NucleicAlphabet*>(alphabet)));
163 throw Exception(
"Nucleotide alphabet not supported.");
167 else if (freqName ==
"Word")
169 if (!(alphabetCode_& WORD))
170 throw Exception(
"Word alphabet not supported.");
171 if (!AlphabetTools::isWordAlphabet(alphabet))
172 throw Exception(
"BppOFrequenciesSetFormat::read(...).\n\t Bad alphabet type " 173 + alphabet->getAlphabetType() +
" for frequencies set " + freqName +
".");
175 const WordAlphabet* pWA =
dynamic_cast<const WordAlphabet*
>(alphabet);
177 if (args.find(
"frequency") != args.end())
179 string sAFS = args[
"frequency"];
181 unsigned int nbfreq = pWA->getLength();
183 for (
unsigned i = 0; i < nbfreq; i++)
185 st += TextTools::toString(i + 1);
189 auto_ptr<FrequenciesSet> pFS2(nestedReader.
read(pWA->getNAlphabet(0), sAFS, data,
false));
191 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
193 unparsedArguments_[st +
"_" + it->first] = it->second;
199 if (args.find(
"frequency1") == args.end())
200 throw Exception(
"BppOFrequenciesSetFormat::read. Missing argument 'frequency' or 'frequency1' for frequencies set 'Word'.");
201 vector<string> v_sAFS;
202 vector<FrequenciesSet*> v_AFS;
203 unsigned int nbfreq = 1;
205 while (args.find(
"frequency" + TextTools::toString(nbfreq)) != args.end())
207 v_sAFS.push_back(args[
"frequency" + TextTools::toString(nbfreq++)]);
210 if (v_sAFS.size() != pWA->getLength())
211 throw Exception(
"BppOFrequenciesSetFormat::read. Number of frequencies (" + TextTools::toString(v_sAFS.size()) +
") does not match length of the words (" + TextTools::toString(pWA->getLength()) +
")");
213 for (
size_t i = 0; i < v_sAFS.size(); ++i)
216 pFS.reset(nestedReader.
read(pWA->getNAlphabet(i), v_sAFS[i], data,
false));
218 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
220 unparsedArguments_[TextTools::toString(i + 1) +
"_" + it->first] = it->second;
222 v_AFS.push_back(pFS.release());
229 else if (freqName ==
"Codon")
231 if (!(alphabetCode_ & CODON))
232 throw Exception(
"Codon alphabet not supported.");
233 if (!AlphabetTools::isCodonAlphabet(alphabet))
234 throw Exception(
"BppOFrequenciesSetFormat::read.\n\t Bad alphabet type " 235 + alphabet->getAlphabetType() +
" for frequenciesset " + freqName +
".");
237 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
239 const CodonAlphabet* pWA =
dynamic_cast<const CodonAlphabet*
>(alphabet);
241 if (args.find(
"frequency") != args.end())
243 string sAFS = args[
"frequency"];
246 auto_ptr<FrequenciesSet> pFS2(nestedReader.
read(pWA->getNAlphabet(0), sAFS, data,
false));
249 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
251 unparsedArguments_[
"123_" + it->first] = it->second;
258 if (args.find(
"frequency1") == args.end())
259 throw Exception(
"BppOFrequenciesSetFormat::read. Missing argument 'frequency' or 'frequency1' for frequencies set.");
260 vector<string> v_sAFS;
261 vector<FrequenciesSet*> v_AFS;
262 unsigned int nbfreq = 1;
264 while (args.find(
"frequency" + TextTools::toString(nbfreq)) != args.end())
266 v_sAFS.push_back(args[
"frequency" + TextTools::toString(nbfreq++)]);
269 if (v_sAFS.size() != 3)
270 throw Exception(
"BppOFrequenciesSetFormat::read. Number of frequencies (" + TextTools::toString(v_sAFS.size()) +
") is not three");
272 for (
size_t i = 0; i < v_sAFS.size(); ++i)
275 pFS.reset(nestedReader.
read(pWA->getNAlphabet(i), v_sAFS[i], data,
false));
277 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
279 unparsedArguments_[TextTools::toString(i + 1) +
"_" + it->first] = it->second;
281 v_AFS.push_back(pFS.release());
285 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
291 else if (freqName ==
"FullPerAA")
293 if (!(alphabetCode_ & CODON))
294 throw Exception(
"Codon alphabet not supported.");
295 if (!AlphabetTools::isCodonAlphabet(alphabet))
296 throw Exception(
"BppOFrequenciesSetFormat::read.\n\t Bad alphabet type " 297 + alphabet->getAlphabetType() +
" for frequenciesset " + freqName +
".");
300 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
302 const ProteicAlphabet* pPA =
dynamic_cast<const ProteicAlphabet*
>(geneticCode_->getTargetAlphabet());
304 unsigned short method=1;
305 if (args.find(
"method") != args.end()){
306 if (args[
"method"]==
"local")
309 if (args[
"method"]==
"binary")
313 if (args.find(
"protein_frequencies") != args.end())
315 string sPFS = args[
"protein_frequencies"];
317 auto_ptr<ProteinFrequenciesSet> pPFS(dynamic_cast<ProteinFrequenciesSet*>(nestedReader.
read(pPA, sPFS, data,
false)));
320 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
322 unparsedArguments_[
"FullPerAA." + it->first] = it->second;
332 else if (AlphabetTools::isCodonAlphabet(alphabet))
334 if (!(alphabetCode_ & CODON))
335 throw Exception(
"Codon alphabet not supported.");
337 throw Exception(
"BppOFrequenciesSetFormat::read(). No genetic code specified! Consider using 'setGeneticCode'.");
339 const CodonAlphabet* pWA =
dynamic_cast<const CodonAlphabet*
>(alphabet);
341 if (args.find(
"genetic_code") != args.end()) {
342 ApplicationTools::displayWarning(
"'genetic_code' argument is no longer supported inside model description, and has been supersided by a global 'genetic_code' option.");
343 throw Exception(
"BppOFrequenciesSetFormat::read. Deprecated 'genetic_code' argument.");
346 string mgmtStopCodon =
"quadratic";
348 if (freqName ==
"F0")
352 else if (freqName ==
"F1X4")
356 if (args.find(
"mgmtStopCodon") != args.end()){
357 mgmtStopCodon = args[
"mgmtStopCodon"];
358 ApplicationTools::displayResult(
"StopCodon frequencies distribution ", mgmtStopCodon);
360 if (args.find(
"frequency") != args.end())
362 string sAFS = args[
"frequency"];
365 auto_ptr<FrequenciesSet> pFS2(nestedReader.
read(pWA->getNAlphabet(0), sAFS, data,
false));
367 throw Exception(
"BppOFrequenciesSetFormat::read. The frequency option in F1X4 can only be Full");
371 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
373 unparsedArguments_[
"123_" + it->first] = it->second;
376 if (args.find(
"123_theta") != args.end())
377 unparsedArguments_[
"123_Full.theta"] = args[
"123_theta"];
378 if (args.find(
"123_theta1") != args.end())
379 unparsedArguments_[
"123_Full.theta1"] = args[
"123_theta1"];
380 if (args.find(
"123_theta2") != args.end())
381 unparsedArguments_[
"123_Full.theta2"] = args[
"123_theta2"];
382 if (args.find(
"theta") != args.end())
383 unparsedArguments_[
"123_Full.theta"] = args[
"123_theta"];
384 if (args.find(
"theta1") != args.end())
385 unparsedArguments_[
"123_Full.theta1"] = args[
"123_theta1"];
386 if (args.find(
"theta2") != args.end())
387 unparsedArguments_[
"123_Full.theta2"] = args[
"123_theta2"];
389 else if (freqName ==
"F3X4")
393 if (args.find(
"mgmtStopCodon") != args.end()){
394 mgmtStopCodon = args[
"mgmtStopCodon"];
395 ApplicationTools::displayResult(
"StopCodon frequencies distribution ", mgmtStopCodon);
398 if (args.find(
"frequency1") != args.end() ||
399 args.find(
"frequency2") != args.end() ||
400 args.find(
"frequency3") != args.end())
402 vector<string> v_sAFS;
404 for (
unsigned int nbfreq = 1; nbfreq<=3; nbfreq++)
405 if (args.find(
"frequency" + TextTools::toString(nbfreq)) != args.end())
406 v_sAFS.push_back(args[
"frequency" + TextTools::toString(nbfreq)]);
408 v_sAFS.push_back(
"");
410 for (
unsigned i = 0; i < v_sAFS.size(); i++)
414 pFS.reset(nestedReader.
read(pWA->getNAlphabet(i), v_sAFS[i], data,
false));
416 throw Exception(
"BppOFrequenciesSetFormat::read. The frequency options in F3X4 can only be Full");
419 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
421 unparsedArguments_[TextTools::toString(i + 1) +
"_" + it->first] = it->second;
426 for (
unsigned int i = 1 ; i <= 3; i++){
428 if (args.find(TextTools::toString(i)+
"_theta") != args.end())
429 unparsedArguments_[TextTools::toString(i)+
"_Full.theta"] = args[TextTools::toString(i)+
"_theta"];
430 if (args.find(TextTools::toString(i)+
"_theta1") != args.end())
431 unparsedArguments_[TextTools::toString(i)+
"_Full.theta1"] = args[TextTools::toString(i)+
"_theta1"];
432 if (args.find(TextTools::toString(i)+
"_theta2") != args.end())
433 unparsedArguments_[TextTools::toString(i)+
"_Full.theta2"] = args[TextTools::toString(i)+
"_theta2"];
436 else if (freqName ==
"F61")
441 unsigned short method=1;
442 if (args.find(
"method") != args.end()){
443 if (args[
"method"]==
"local")
446 if (args[
"method"]==
"binary")
452 throw Exception(
"Unknown frequency option: " + freqName);
456 else if (freqName ==
"MVAprotein")
458 pFS.reset(
new MvaFrequenciesSet(dynamic_cast<const ProteicAlphabet*>(alphabet)));
462 throw Exception(
"Unknown frequency option: " + freqName);
464 vector<string> pnames = pFS->getParameters().getParameterNames();
466 string pref = pFS->getNamespace();
467 for (
size_t i = 0; i < pnames.size(); i++)
469 string name = pFS->getParameterNameWithoutNamespace(pnames[i]);
470 if (args.find(name) != args.end())
471 unparsedArguments_[pref + name] = args[name];
475 if (args.find(
"init") != args.end())
477 unparsedArguments_[
"init"] = args[
"init"];
478 unparsedArguments_[
"initFreqs"] = args[
"init"];
480 if (args.find(
"init.observedPseudoCount") != args.end())
482 unparsedArguments_[
"init.observedPseudoCount"] = args[
"init.observedPseudoCount"];
483 unparsedArguments_[
"initFreqs.observedPseudoCount"] = args[
"init.observedPseudoCount"];
485 if (args.find(
"values") != args.end())
487 unparsedArguments_[
"initFreqs"] =
"values" + args[
"values"];
488 unparsedArguments_[
"init"] =
"values" + args[
"values"];
492 initialize_(*pFS, data);
493 return pFS.release();
498 std::vector<std::string>& writtenNames)
const 505 ParameterList pl = pfreqset->getParameters();
506 int p = out.getPrecision();
507 out.setPrecision(12);
509 string name = pfreqset->
getName();
513 if ((name ==
"Fixed") || (name ==
"F0"))
517 for (
size_t i = 0; i < vf.size(); i++)
527 if (dynamic_cast<const FullProteinFrequenciesSet*>(pfreqset))
533 out <<
"method=" << ((meth==2)?
"local":
"binary");
539 if (name !=
"F1X4" && name !=
"F3X4" && name !=
"F61")
544 for (
size_t i = 0; i < pWFI->
getLength(); i++)
548 out <<
"frequency" << i + 1 <<
"=";
556 for (
size_t i = 0; i < pWFU->
getLength(); i++)
569 out <<
"protein_frequencies=";
571 write(ppfs, out, writtenNames);
579 out <<
"method=" << ((meth==2)?
"local":
"binary");
587 for (
size_t i = 0; i < pl.size(); i++)
589 if (find(writtenNames.begin(), writtenNames.end(), pl[i].getName()) == writtenNames.end())
595 string pname = pfreqset->getParameterNameWithoutNamespace(pl[i].getName());
596 (out << pname <<
"=").enableScientificNotation(
false) << pl[i].getValue();
597 writtenNames.push_back(pl[i].getName());
607 out <<
"method=" << ((meth==2)?
"local":
"binary");
619 if (unparsedArguments_.find(
"init") != unparsedArguments_.end())
622 string init = unparsedArguments_[
"init"];
623 if (init ==
"observed")
626 throw Exception(
"Missing data for observed frequencies");
627 unsigned int psc = 0;
628 if (unparsedArguments_.find(
"observedPseudoCount") != unparsedArguments_.end())
629 psc = TextTools::to<unsigned int>(unparsedArguments_[
"observedPseudoCount"]);
631 map<int, double> freqs;
632 SequenceContainerTools::getFrequencies(*data, freqs, psc);
636 else if (init.substr(0, 6) ==
"values")
639 vector<double> frequencies;
640 string rf = init.substr(6);
642 StringTokenizer strtok(rf.substr(1, rf.length() - 2),
",");
643 while (strtok.hasMoreToken())
644 frequencies.push_back(TextTools::toDouble(strtok.nextToken()));
647 else if (init ==
"balanced")
652 throw Exception(
"Unknown init argument");
657 ParameterList pl = freqSet.getParameters();
659 for (
size_t i = 0; i < pl.size(); ++i)
661 AutoParameter ap(pl[i]);
663 ap.setMessageHandler(ApplicationTools::warning);
664 pl.setParameter(i, ap);
667 for (
size_t i = 0; i < pl.size(); ++i)
669 const string pName = pl[i].getName();
670 double value = ApplicationTools::getDoubleParameter(pName, unparsedArguments_, pl[i].getValue(),
"",
true, warningLevel_);
672 pl[i].setValue(value);
674 ApplicationTools::displayResult(
"Parameter found", pName +
"=" + TextTools::toString(pl[i].getValue()));
677 freqSet.matchParametersValues(pl);
virtual size_t getLength() const
A generic FrequenciesSet for Full Codon alphabets.
FrequenciesSet useful for homogeneous and stationary models.
const FrequenciesSet & getFrequenciesSetForLetter(size_t i) const
This class implements a state map where all resolved states are modeled.
the Frequencies in codons are the product of Independent Frequencies in letters with the frequencies ...
FrequenciesSet integrating ProteinFrequenciesSet inside CodonFrequenciesSet. In this case...
virtual void setFrequencies(const std::vector< double > &frequencies)=0
Set the parameters in order to match a given set of frequencies.
FrequenciesSet useful for homogeneous and stationary models, protein implementation.
virtual const std::vector< double > getFrequencies() const =0
Parametrize a set of state frequencies.
const FrequenciesSet & getFrequenciesSetForLetter(size_t i) const
unsigned short getMethod() const
const ProteinFrequenciesSet * getProteinFrequenciesSet() const
Nucleotide FrequenciesSet using three independent parameters (theta, theta1, theta2) to modelize the ...
A generic FrequenciesSet allowing for the estimation of all frequencies.
Protein FrequenciesSet using 19 independent parameters to model the 20 frequencies.
virtual std::string getName() const =0
FrequenciesSet useful for homogeneous and stationary models, codon implementation.
Parametrize a set of state frequencies for proteins.
FrequenciesSet useful for homogeneous and stationary models, nucleotide implementation.
static FrequenciesSet * getFrequenciesSetForCodons(short option, const GeneticCode *gCode, const std::string &mgmtStopFreq="quadratic", unsigned short method=1)
A helper function that provide frequencies set for codon models according to PAML option...
unsigned short getMethod() const
the Frequencies in words are the product of Independent Frequencies in letters
virtual void setFrequenciesFromAlphabetStatesFrequencies(const std::map< int, double > &frequencies)=0
Set the Frequencies from the one of the map which keys match with a letter of the Alphabet...
A frequencies set used to estimate frequencies at the root with the COaLA model. Frequencies at the r...
Nucleotide FrequenciesSet using only one parameter, the GC content.
the Frequencies in codons are the product of the frequencies for a unique FrequenciesSet in letters...