bpp-popgen  2.2.0
Genepop.cpp
Go to the documentation of this file.
1 //
2 // File Genepop.cpp
3 // Author : Sylvain Gaillard
4 // Last modification : Tuesday September 21 2004
5 //
6 
7 /*
8  Copyright or © or Copr. CNRS, (November 17, 2004)
9 
10  This software is a computer program whose purpose is to provide classes
11  for population genetics analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "Genepop.h"
41 
42 using namespace bpp;
43 using namespace std;
44 
46 
48 
49 void Genepop::read(istream& is, DataSet& data_set) throw (Exception)
50 {
51  if (!is)
52  throw IOException("Genepop::read: fail to open stream.");
53  // Skip first line
54  FileTools::getNextLine(is);
55  ios::pos_type entry_point = is.tellg();
56  bool eof_ok = false;
57  bool loc_def_ok = false;
58  bool loc_nbr_ok = false;
59  size_t grp_nbr = 0;
60  vector<LocusInfo> tmp_loc;
61  vector<set<string> > al_ids;
62  map<string, size_t> ind_id_count;
63  map<string, size_t> ind_id_index;
64 
65  string temp("");
66  // First read : file structure
67  while (!eof_ok)
68  {
69  if (is.peek() == EOF && !eof_ok)
70  {
71  // If eof rewind to entry_point
72  is.seekg(entry_point);
73  eof_ok = true;
74  }
75  else
76  {
77  // Count everything
78  temp = FileTools::getNextLine(is);
79  string cp_temp = TextTools::removeSurroundingWhiteSpaces(temp);
80  cp_temp = TextTools::toUpper(cp_temp);
81  if (cp_temp == string("POP"))
82  {
83  loc_def_ok = true;
84  grp_nbr++;
85  data_set.addEmptyGroup(grp_nbr);
86  }
87  if (!loc_def_ok)
88  {
89  StringTokenizer st(temp, string(", "), true);
90  while (st.hasMoreToken())
91  tmp_loc.push_back(LocusInfo(TextTools::removeSurroundingWhiteSpaces(st.nextToken())));
92  }
93  if (loc_def_ok && !loc_nbr_ok)
94  {
95  al_ids.resize(tmp_loc.size());
96  loc_nbr_ok = true;
97  }
98  if (loc_def_ok)
99  {
100  string alleles;
101  StringTokenizer st(temp, string(","));
102  if (st.numberOfRemainingTokens() == 2)
103  {
104  ind_id_count[TextTools::removeSurroundingWhiteSpaces(st.nextToken())]++;
105  alleles = st.nextToken();
106  }
107  StringTokenizer st2(alleles);
108  if ((size_t)st2.numberOfRemainingTokens() == tmp_loc.size())
109  {
110  size_t i = 0;
111  while (st2.hasMoreToken())
112  {
113  string ids = TextTools::removeSurroundingWhiteSpaces(st2.nextToken());
114  string tmp_id = string(ids.begin(), ids.begin() + (ids.size() / 2));
115  if (tmp_id != string("00") && tmp_id != string("000"))
116  al_ids[i].insert(tmp_id);
117  tmp_id = string(ids.begin() + (ids.size() / 2), ids.end());
118  if (tmp_id != string("00") && tmp_id != string("000"))
119  al_ids[i].insert(tmp_id);
120  i++;
121  }
122  }
123  }
124  }
125  }
126 
127  // Set AnalyzedLoci
128  data_set.initAnalyzedLoci(tmp_loc.size());
129  for (size_t i = 0; i < tmp_loc.size(); i++)
130  {
131  data_set.setLocusInfo(i, tmp_loc[i]);
132  for (set<string>::iterator it = al_ids[i].begin(); it != al_ids[i].end(); it++)
133  {
134  data_set.addAlleleInfoByLocusPosition(i, BasicAlleleInfo(*it));
135  }
136  }
137 
138  // Second read : file data
139  grp_nbr = 0;
140  size_t grp_pos = 0;
141  loc_def_ok = false;
142  while (!is.eof())
143  {
144  temp = FileTools::getNextLine(is);
145  string cp_temp = TextTools::removeSurroundingWhiteSpaces(temp);
146  cp_temp = TextTools::toUpper(cp_temp);
147  if (cp_temp == string("POP"))
148  {
149  grp_nbr++;
150  loc_def_ok = true;
151  grp_pos = data_set.getGroupPosition(grp_nbr);
152  }
153  else
154  {
155  if (loc_def_ok)
156  {
157  string alleles;
158  StringTokenizer st(temp, string(","));
159  size_t ind_pos = 0;
160  if (st.numberOfRemainingTokens() == 2)
161  {
162  string ind_id = TextTools::removeSurroundingWhiteSpaces(st.nextToken());
163  if (ind_id_count[ind_id] > 1)
164  ind_id = ind_id + string("_") + TextTools::toString(++ind_id_index[ind_id]);
165  data_set.addEmptyIndividualToGroup(grp_pos, ind_id);
166  ind_pos = data_set.getIndividualPositionInGroup(grp_pos, ind_id);
167  data_set.initIndividualGenotypeInGroup(grp_pos, ind_pos);
168  alleles = st.nextToken();
169  }
170  StringTokenizer st2(alleles);
171  if ((size_t)st2.numberOfRemainingTokens() == tmp_loc.size())
172  {
173  size_t i = 0;
174  while (st2.hasMoreToken())
175  {
176  string ids = TextTools::removeSurroundingWhiteSpaces(st2.nextToken());
177  vector<string> tmp_ids;
178  tmp_ids.push_back(string(ids.begin(), ids.begin() + (ids.size() / 2)));
179  tmp_ids.push_back(string(ids.begin() + (ids.size() / 2), ids.end()));
180  if (tmp_ids[0] != string("00") && tmp_ids[0] != string("000")
181  && tmp_ids[1] != string("00") && tmp_ids[1] != string("000"))
182  {
183  data_set.setIndividualMonolocusGenotypeByAlleleIdInGroup(grp_pos, ind_pos, i, tmp_ids);
184  }
185  i++;
186  tmp_ids.clear();
187  }
188  }
189  }
190  }
191  }
192 }
193 
194 void Genepop::read(const string& path, DataSet& data_set) throw (Exception)
195 {
196  AbstractIDataSet::read(path, data_set);
197 }
198 
199 DataSet* Genepop::read(istream& is) throw (Exception)
200 {
201  return AbstractIDataSet::read(is);
202 }
203 
204 DataSet* Genepop::read(const string& path) throw (Exception)
205 {
206  return AbstractIDataSet::read(path);
207 }
208 
void read(std::istream &is, DataSet &data_set)
Read a DataSet on istream.
Definition: Genepop.cpp:49
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet on istream.
STL namespace.
The BasicAlleleInfo class.
The LocusInfo class.
Definition: LocusInfo.h:63
The DataSet class.
Definition: DataSet.h:73