56 missing_data_symbol_(
'$') {}
59 const std::string& data_separator)
60 throw (Exception) : data_separator_(
' '),
61 missing_data_symbol_(
'$')
65 setDataSeparator(data_separator);
66 setMissingDataSymbol(missing_data_symbol);
78 if (missing_data_symbol.size() != 1 || isdigit(missing_data_symbol[0])
79 || TextTools::isWhiteSpaceCharacter(missing_data_symbol[0])
80 || missing_data_symbol[0] == data_separator_
82 throw Exception(
"PopgenlibIO::setMissingData: not expected value for missing_data_symbol.");
84 missing_data_symbol_ = missing_data_symbol[0];
89 if (data_separator == WHITESPACE)
90 data_separator_ =
' ';
91 else if (data_separator == TAB)
92 data_separator_ =
'\t';
93 else if (data_separator == COMA)
94 data_separator_ =
',';
95 else if (data_separator == SEMICOLON)
96 data_separator_ =
';';
99 if (isdigit(data_separator[0])
100 || data_separator == getMissingDataSymbol()
102 throw Exception(
"PopgenlibIO::setDataSeparator: not expected value for data_separator.");
103 data_separator_ = data_separator.c_str()[0];
117 case (
'\t'):
return TAB;
118 case (
','):
return COMA;
137 throw IOException(
"PopgenlibIO::read: fail to open stream.");
139 vector<string> temp_v;
141 VectorSequenceContainer* tmp_vsc = NULL;
143 vector<LocusInfo> tmp_locinf;
145 bool section1 =
true;
146 bool section2 =
true;
147 bool section3 =
true;
148 bool section4 =
true;
149 bool section5 =
true;
150 size_t current_section = 0;
151 size_t previous_section = 0;
156 temp = FileTools::getNextLine(is);
159 if (temp.find(
"[General]", 0) != string::npos)
161 previous_section = current_section;
165 else if (temp.find(
"[Localities]", 0) != string::npos)
167 previous_section = current_section;
171 else if (temp.find(
"[Sequences]", 0) != string::npos)
173 previous_section = current_section;
177 else if (temp.find(
"[Loci]", 0) != string::npos)
179 previous_section = current_section;
183 else if (temp.find(
"[Individuals]", 0) != string::npos)
185 previous_section = current_section;
190 if (current_section == 1 && previous_section < 1)
192 temp_v.push_back(temp);
194 if (section1 && current_section != 1 && previous_section == 1)
197 parseGeneral_(temp_v, data_set);
199 if (data_set.hasSequenceData() && tmp_vsc == NULL)
200 tmp_vsc =
new VectorSequenceContainer(data_set.getAlphabet());
204 if (current_section == 2 && previous_section < 2)
206 if (temp.find(
">", 0) != string::npos)
208 parseLocality_(temp_v, data_set);
210 temp_v.push_back(temp);
213 temp_v.push_back(temp);
215 if (section2 && current_section != 2 && previous_section == 2)
218 parseLocality_(temp_v, data_set);
223 if (current_section == 3 && previous_section < 3)
225 if (temp.find(
">", 0) != string::npos)
227 parseSequence_(temp_v, *tmp_vsc);
229 temp_v.push_back(temp);
232 temp_v.push_back(temp);
234 if (section3 && current_section != 3 && previous_section == 3)
237 parseSequence_(temp_v, *tmp_vsc);
242 if (current_section == 4 && previous_section < 4)
244 if (temp.find(
">", 0) != string::npos)
246 parseLoci_(temp_v, tmp_locinf);
248 temp_v.push_back(temp);
251 temp_v.push_back(temp);
253 if (section4 && current_section != 4 && previous_section == 4)
256 parseLoci_(temp_v, tmp_locinf);
259 for (
size_t i = 0; i < tmp_locinf.size(); i++)
263 data_set.setAnalyzedLoci(tmp_anloc);
267 if (current_section == 5 && previous_section < 5)
269 if (temp.find(
">", 0) != string::npos)
271 parseIndividual_(temp_v, data_set, *tmp_vsc);
273 temp_v.push_back(temp);
276 temp_v.push_back(temp);
278 if (section5 && current_section != 5 && previous_section == 5)
281 parseIndividual_(temp_v, data_set, *tmp_vsc);
286 if (section2 && current_section == 2)
287 parseLocality_(temp_v, data_set);
288 if (section3 && current_section == 3)
289 parseSequence_(temp_v, *tmp_vsc);
290 if (section5 && current_section == 5)
291 parseIndividual_(temp_v, data_set, *tmp_vsc);
298 for (
size_t i = 0; i < in.size(); i++)
303 while (!is.eof() && in.size() != 0)
305 temp = FileTools::getNextLine(is);
306 if (temp.find(
"MissingData", 0) != string::npos)
308 if (temp.find(
"DataSeparator", 0) != string::npos)
310 if (temp.find(
"SequenceType", 0) != string::npos)
318 for (
size_t i = 0; i < in.size(); i++)
324 while (!is.eof() && in.size() != 0)
326 temp = FileTools::getNextLine(is);
328 if (temp.find(
">", 0) != string::npos)
330 tmp_locality.
setName(TextTools::removeSurroundingWhiteSpaces(
string(temp.begin() + 1, temp.end())));
332 if (temp.find(
"Coord", 0) != string::npos)
335 tmp_locality.setX(TextTools::toDouble(v[0]));
336 tmp_locality.setY(TextTools::toDouble(v[1]));
339 if (tmp_locality.
getName() !=
"")
347 for (
size_t i = 0; i < in.size(); i++)
351 ifasta.readSequences(is, vsc);
357 for (
size_t i = 0; i < in.size(); i++)
361 string locinf_name =
"";
366 temp = FileTools::getNextLine(is);
367 if (temp.find(
">", 0) != string::npos)
369 locinf_name = TextTools::removeSurroundingWhiteSpaces(
string(temp.begin() + 1, temp.end()));
371 if (temp.find(
"Ploidy", 0) != string::npos)
374 string tmp_str_ploidy = TextTools::removeSurroundingWhiteSpaces(v[0]);
375 tmp_str_ploidy = TextTools::toUpper(tmp_str_ploidy);
379 else if (tmp_str_ploidy ==
HAPLOID)
383 else if (tmp_str_ploidy ==
UNKNOWN)
386 if (temp.find(
"NbAlleles", 0) != string::npos)
391 if (locinf_name !=
"")
392 locus_info.push_back(
LocusInfo(locinf_name, locinf_ploidy));
398 size_t tmp_group_pos = 0;
400 for (
size_t i = 0; i < in.size(); i++)
403 if (in[i].find(
">", 0) != string::npos)
405 tmp_indiv.
setId(TextTools::removeSurroundingWhiteSpaces(
string(in[i].begin() + 1, in[i].end())));
408 if (in[i].find(
"Group", 0) != string::npos)
411 tmp_group_pos = TextTools::to<size_t>(
getValues_(temp,
"=")[0]);
420 if (in[i].find(
"Locality", 0) != string::npos)
423 size_t sep_pos = temp.find(
"=", 0);
424 string loc_name = TextTools::removeSurroundingWhiteSpaces(
string(temp.begin() +
static_cast<ptrdiff_t
>(sep_pos + 1), temp.end()));
433 if (in[i].find(
"Coord", 0) != string::npos)
439 if (in[i].find(
"Date", 0) != string::npos)
444 d = TextTools::toInt(
string(tmp_date.begin(), tmp_date.begin() + 2));
445 m = TextTools::toInt(
string(tmp_date.begin() + 2, tmp_date.begin() + 4));
446 y = TextTools::toInt(
string(tmp_date.begin() + 4, tmp_date.end()));
450 if (in[i].find(
"SequenceData", 0) != string::npos)
454 vector<string> seq_pos_str =
getValues_(temp,
"");
455 for (
size_t j = 0; j < seq_pos_str.size(); j++)
460 tmp_indiv.
addSequence(j, vsc.getSequence(TextTools::to<size_t>(seq_pos_str[j]) - 1));
467 if (in[i].find(
"AllelicData", 0) != string::npos)
469 string temp1 = in[++i];
470 string temp2 = in[++i];
471 vector<string> allele_pos_str1 =
getValues_(temp1,
"");
472 vector<string> allele_pos_str2 =
getValues_(temp2,
"");
479 if (allele_pos_str1.size() == allele_pos_str2.size())
481 for (
size_t j = 0; j < allele_pos_str1.size(); j++)
484 allele_pos_str1[j] = TextTools::removeSurroundingWhiteSpaces(allele_pos_str1[j]);
485 vector<string> tmp_alleles_id;
495 tmp_alleles_id.push_back(allele_pos_str1[j]);
497 allele_pos_str2[j] = TextTools::removeSurroundingWhiteSpaces(allele_pos_str2[j]);
507 tmp_alleles_id.push_back(allele_pos_str2[j]);
519 if (tmp_indiv.
getId() !=
"")
549 os <<
"[General]" << endl;
550 os <<
"MissingData = " << getMissingDataSymbol() << endl;
551 os <<
"DataSeparator = " << getDataSeparator() << endl;
552 if (data_set.hasSequenceData())
554 string seq_type = data_set.getAlphabetType();
555 os <<
"SequenceType = " << seq_type << endl;
558 if (data_set.hasLocality())
560 os << endl <<
"[Localities]" << endl;
561 for (
size_t i = 0; i < data_set.getNumberOfLocalities(); i++)
563 os <<
">" << (data_set.getLocalityAtPosition(i)).getName() << endl;
564 os <<
"Coord = " << (data_set.getLocalityAtPosition(i)).getX();
565 os <<
" " << (data_set.getLocalityAtPosition(i)).getY() << endl;
570 if (data_set.hasSequenceData())
573 os << endl <<
"[Sequences]" << endl;
574 for (
size_t i = 0; i < data_set.getNumberOfGroups(); i++)
576 for (
size_t j = 0; j < data_set.getNumberOfIndividualsInGroup(i); j++)
578 fasta.writeSequences(os, data_set.getIndividualAtPositionFromGroup(i, j)->getSequences());
584 if (data_set.hasAlleleicData())
586 os << endl <<
"[Loci]" << endl;
587 for (
size_t i = 0; i < data_set.getNumberOfLoci(); i++)
589 const LocusInfo& tmp_locus_info = data_set.getLocusInfoAtPosition(i);
590 os <<
">" << tmp_locus_info.
getName() << endl;
606 os << endl <<
"[Individuals]" << endl;
607 for (
size_t i = 0; i < data_set.getNumberOfGroups(); i++)
609 for (
size_t j = 0; j < data_set.getNumberOfIndividualsInGroup(i); j++)
613 const Individual* tmp_ind = data_set.getIndividualAtPositionFromGroup(i, j);
614 os <<
">" << tmp_ind->
getId() << endl;
615 os <<
"Group = " << TextTools::toString((data_set.getGroupAtPosition(i)).getGroupId()) << endl;
619 os <<
"Coord = " << tmp_ind->
getX() <<
" " << tmp_ind->
getY() << endl;
625 os <<
"SequenceData = {" << endl;
626 for (
size_t k = 0; k < nbss; k++)
631 os << TextTools::toString(seqcpt++);
633 catch (SequenceNotFoundException)
635 os << getMissingDataChar();
638 os << getDataSeparatorChar();
647 vector<vector<string> > output(tmp_genotype.
size());
648 os <<
"AllelicData = {" << endl;
649 for (
size_t k = 0; k < tmp_genotype.
size(); k++)
654 output[k][0] = getMissingDataChar();
655 output[k][1] = getMissingDataChar();
660 output[k][0] = data_set.getLocusInfoAtPosition(k).getAlleleInfoByKey(tmp_all_ind[0]).getId();
661 if (tmp_all_ind.size() > 1)
662 output[k][1] = data_set.getLocusInfoAtPosition(k).getAlleleInfoByKey(tmp_all_ind[1]).getId();
664 output[k][1] = getMissingDataChar();
667 for (
size_t k = 0; k < output.size(); k++)
670 if (k < output.size() - 1)
671 os << getDataSeparatorChar();
675 for (
size_t k = 0; k < output.size(); k++)
678 if (k < output.size() - 1)
679 os << getDataSeparatorChar();
696 vector<string> values;
697 size_t limit = param_line.find(delim, 0);
698 if (limit != string::npos)
699 param_line = string(param_line.begin() +
static_cast<ptrdiff_t
>(limit + delim.size()), param_line.end());
700 param_line = TextTools::removeSurroundingWhiteSpaces(param_line);
706 values.push_back(
string(param_line.begin() +
static_cast<ptrdiff_t
>(bi), param_line.begin() +
static_cast<ptrdiff_t
>(bs)));
710 values.push_back(
string(param_line.begin() +
static_cast<ptrdiff_t
>(bi), param_line.end()));
void parseGeneral_(const std::vector< std::string > &in, DataSet &data_set)
size_t getNumberOfSequences() const
Get the number of sequences.
static unsigned int DIPLOID
void setDate(const Date &date)
Set the date of the Individual.
const MonolocusGenotype & getMonolocusGenotype(size_t locus_position) const
Get a MonolocusGenotype.
void initGenotype(size_t loci_number)
Init the genotype.
double getX() const
Get the X coordinate of the Individual.
char missing_data_symbol_
bool hasCoord() const
Tell if this Individual has coordinates.
bool hasLocality() const
Tell if this Individual has a locality.
static const std::string SEMICOLON
char getMissingDataChar() const
Get the character for missing data.
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet on istream.
unsigned int getPloidy() const
Get the ploidy of the locus.
void parseLocality_(const std::vector< std::string > &in, DataSet &data_set)
static const std::string UNKNOWN
const Locality< double > & getLocalityByName(const std::string &name) const
Get a Locality by name.
std::string getDataSeparator() const
Get the code for data separator.
void addAlleleInfoByLocusPosition(size_t locus_position, const AlleleInfo &allele)
Add an AlleleInfo to a LocusInfo.
static unsigned int HAPLOID
The BasicAlleleInfo class.
void setId(const std::string &id)
Set the id of the Individual.
void parseIndividual_(const std::vector< std::string > &in, DataSet &data_set, const VectorSequenceContainer &vsc)
The MultilocusGenotype class.
static const std::string HAPLODIPLOID
const Locality< double > * getLocality() const
Get the locality of the Individual.
bool hasGenotype() const
Tell if the Individual has a MultilocusGenotype.
const std::string & getName() const
Get the name of the locality.
std::string getDateStr() const
Get the Date as a string.
bool isMonolocusGenotypeMissing(size_t locus_position) const
Tell if a MonolocusGenotype is a missing data.
size_t getNumberOfLoci() const
Get the number of loci.
char getDataSeparatorChar() const
Get the data separator char.
bool hasSequences() const
Tell if the Individual has some sequences.
void setDataSeparator(const std::string &data_separator)
Set the code for data separator.
bool hasDate() const
Tell if this Individual has a date.
const std::string & getId() const
Get the id of the Individual.
std::string getMissingDataSymbol() const
Get the code for missing data.
void parseSequence_(const std::vector< std::string > &in, VectorSequenceContainer &vsc)
void setMonolocusGenotypeByAlleleId(size_t locus_position, const std::vector< std::string > allele_id, const LocusInfo &locus_info)
Set a MonolocusGenotype.
const LocusInfo & getLocusInfoAtPosition(size_t locus_position) const
Get a LocusInfo by its position.
const std::string & getName() const
Get the name of the locus.
void setLocusInfo(size_t locus_position, const LocusInfo &locus)
Set a LocusInfo.
static unsigned int UNKNOWN
size_t getGroupPosition(size_t group_id) const
Get the position of a Group.
void addIndividualToGroup(size_t group_position, const Individual &individual)
Add an Individual to a Group.
static unsigned int HAPLODIPLOID
size_t getNumberOfAlleles() const
Get the number of alleles at this locus.
void setName(const std::string &name)
Set the name of the locality.
std::vector< std::string > getValues_(std::string &param_line, const std::string &delim)
void setAlphabet(const Alphabet *alpha)
Set the alphabet of the AnalyzedSequences.
size_t size() const
Count the number of loci.
const Sequence & getSequenceAtPosition(const size_t sequence_position) const
Get a sequence by its position.
static const std::string HAPLOID
void addLocality(Locality< double > &locality)
Add a locality to the DataSet.
const Date & getDate() const
Get the date of the Individual.
void write(std::ostream &os, const DataSet &data_set) const
Write a DataSet on ostream.
void setCoord(const Point2D< double > &coord)
Set the coordinates of the Individual.
void setLocality(const Locality< double > *locality)
Set the locality of the Individual.
static const std::string TAB
const MultilocusGenotype & getGenotype() const
Get the genotype.
static const std::string COMA
void addSequence(size_t sequence_key, const Sequence &sequence)
Add a sequence to the Individual.
void setMissingDataSymbol(const std::string &missing_data_symbol)
Set the code for missing data.
void addEmptyGroup(size_t group_id)
Add an empty Group to the DataSet.
double getY() const
Get the Y coordinate of the Individual.
void parseLoci_(const std::vector< std::string > &in, std::vector< LocusInfo > &locus_info)
virtual void write(std::ostream &os, const DataSet &data_set) const =0
Write a DataSet on ostream.
static const std::string WHITESPACE
virtual std::vector< size_t > getAlleleIndex() const =0
Get the alleles' index.
void read(std::istream &is, DataSet &data_set)
Read a DataSet on istream.
static const std::string DIPLOID