bpp-popgen  2.2.0
GeneMapperCsvExport.cpp
Go to the documentation of this file.
1 //
2 // File: GeneMapperCsvExport.cpp
3 // Author: Sylvain Gaillard
4 // Created: April 2, 2008
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (April 2, 2008)
9 
10  This software is a computer program whose purpose is to provide classes
11  for population genetics analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
40 #include "GeneMapperCsvExport.h"
41 
42 using namespace bpp;
43 using namespace std;
44 
45 const std::string GeneMapperCsvExport::SAMPLE_FILE_H = "Sample File";
46 const std::string GeneMapperCsvExport::SAMPLE_NAME_H = "Sample Name";
47 const std::string GeneMapperCsvExport::PANEL_H = "Panel";
48 const std::string GeneMapperCsvExport::MARKER_H = "Marker";
49 const std::string GeneMapperCsvExport::DYE_H = "Dye";
50 const std::string GeneMapperCsvExport::ALLELE_H = "Allele ";
51 const std::string GeneMapperCsvExport::SIZE_H = "Size ";
52 const std::string GeneMapperCsvExport::HEIGHT_H = "Height ";
53 const std::string GeneMapperCsvExport::PEAK_AREA_H = "Peak Area ";
54 const std::string GeneMapperCsvExport::DAC_H = "DAC";
55 const std::string GeneMapperCsvExport::AN_H = "AN";
56 
57 //GeneMapperCsvExport::GeneMapperCsvExport(bool ia) : IndependentAlleles_(ia) {}
58 
60 
61 void GeneMapperCsvExport::read(std::istream& is, DataSet& data_set) throw (Exception)
62 {
63  if (!is)
64  throw IOException("GeneMapperCsvExport::read: fail to open stream.");
65 
66  /*
67  * Feed a DataTable with the data
68  */
69  DataTable* dtp = DataTable::read(is, "\t", true, -1);
70  DataTable& dt = *dtp;
71 
72  /*
73  * Fixe the individuals' name if there is duplicate in the file
74  */
75  vector<string> ind_names;
76  vector<string> markers;
77  try
78  {
79  ind_names = dt.getColumn(SAMPLE_NAME_H);
80  markers = dt.getColumn(MARKER_H);
81  }
82  catch (Exception& e)
83  {
84  throw e;
85  }
86  map<string, int> indname_marker;
87  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
88  {
89  string test_lab = dt(i, SAMPLE_NAME_H) + dt(i, MARKER_H);
90  if (indname_marker.find(test_lab) != indname_marker.end())
91  {
92  string new_lab = dt(i, SAMPLE_NAME_H) + "_" + TextTools::toString(indname_marker[test_lab] + 1);
93  dt (i, SAMPLE_NAME_H) = new_lab;
94  }
95  indname_marker[test_lab]++;
96  }
97  ind_names = dt.getColumn(SAMPLE_NAME_H);
98 
99  map<string, size_t> ind_count = VectorTools::countValues(ind_names);
100  ind_names = VectorTools::unique(ind_names);
101  markers = VectorTools::unique(markers);
102  size_t loc_nbr = markers.size();
103 
104  /*
105  * Loci number
106  */
107  data_set.initAnalyzedLoci(loc_nbr);
108 
109  /*
110  * Group of individuals
111  */
112  data_set.addEmptyGroup(0);
113  for (unsigned int i = 0; i < ind_names.size(); i++)
114  {
115  Individual ind(ind_names[i]);
116  data_set.addIndividualToGroup(data_set.getGroupPosition(0), ind);
117  }
118 
119  /*
120  * Loci data
121  */
122  AnalyzedLoci al(markers.size());
123  vector<string> col_names = dt.getColumnNames();
124 
125  // Finds columns containing allele data
126  vector<size_t> alleles_cols;
127  for (size_t i = 0; i < col_names.size(); i++)
128  {
129  if (TextTools::startsWith(col_names[i], ALLELE_H))
130  alleles_cols.push_back(i);
131  }
132  // Set LocusInfo
133  vector<vector<size_t> > alleles_pos;
134  for (size_t i = 0; i < markers.size(); i++)
135  {
136  al.setLocusInfo(i, LocusInfo(markers[i], LocusInfo::UNKNOWN));
137  }
138  std::map< std::string, std::set< std::string > > markerAlleles;
139  for (size_t i = 0; i < dt.getNumberOfRows(); ++i)
140  {
141  for (size_t j = 0; j < alleles_cols.size(); ++j)
142  {
143  if (dt(i, alleles_cols[j]) != "")
144  {
145  markerAlleles[dt(i, MARKER_H)].insert(dt(i, alleles_cols[j]));
146  }
147  }
148  }
149  for (std::map< std::string, std::set< std::string > >::iterator itm = markerAlleles.begin(); itm != markerAlleles.end(); itm++)
150  {
151  std::set< std::string >& s = itm->second;
152  for (std::set< std::string >::iterator its = s.begin(); its != s.end(); its++)
153  {
154  al.addAlleleInfoByLocusName(itm->first, BasicAlleleInfo(*its));
155  }
156  }
157  data_set.setAnalyzedLoci(al);
158 
159  /*
160  * Individuals informations
161  */
162  size_t ind_col_index = VectorTools::which(dt.getColumnNames(), SAMPLE_NAME_H);
163  size_t mark_col_index = VectorTools::which(dt.getColumnNames(), MARKER_H);
164  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
165  {
166  vector<size_t> alleles;
167  for (size_t j = 0; j < alleles_cols.size(); j++)
168  {
169  if (!TextTools::isEmpty(dt(i, alleles_cols[j])))
170  {
171  unsigned int num = (data_set.getLocusInfoByName(dt(i, mark_col_index))).getAlleleInfoKey(dt(i, alleles_cols[j]));
172  alleles.push_back(num);
173  }
174  }
175  alleles = VectorTools::unique(alleles);
176  MultiAlleleMonolocusGenotype ma(alleles);
177  if (!data_set.getIndividualByIdFromGroup(0, dt(i, ind_col_index))->hasGenotype())
178  data_set.initIndividualGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)));
179  if (alleles.size())
180  data_set.setIndividualMonolocusGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)), data_set.getAnalyzedLoci()->getLocusInfoPosition(dt(i, mark_col_index)), ma);
181  }
182  delete dtp;
183 }
184 
185 void GeneMapperCsvExport::read(const std::string& path, DataSet& data_set) throw (Exception)
186 {
187  AbstractIDataSet::read(path, data_set);
188 }
189 
190 DataSet* GeneMapperCsvExport::read(std::istream& is) throw (Exception)
191 {
192  return AbstractIDataSet::read(is);
193 }
194 
195 DataSet* GeneMapperCsvExport::read(const std::string& path) throw (Exception)
196 {
197  return AbstractIDataSet::read(path);
198 }
199 
200 // --- GeneMapperCsvExport::Record ---
201 GeneMapperCsvExport::Record::Record(const std::string& row) : sampleFile_(),
202  sampleName_(),
203  panel_(),
204  markerName_(),
205  dye_(),
206  alleles_(),
207  dac_(),
208  an_(0.)
209 {
210  StringTokenizer st(row, "\t", true, false);
211  /*
212  if (st.numberOfRemainingTokens() != 7 + 4 * alleleNumber) {
213  throw Exception("GeneMapperCsvExport::Record::Record: bad number of allele");
214  }
215  */
216  size_t itemNum = st.numberOfRemainingTokens();
217  size_t alleleNum = (itemNum - 7) / 4;
218  sampleFile_ = st.getToken(0);
219  sampleName_ = st.getToken(1);
220  panel_ = st.getToken(2);
221  markerName_ = st.getToken(3);
222  dye_ = st.getToken(4);
223  dac_ = st.getToken(itemNum - 2);
224  an_ = TextTools::toDouble(st.getToken(itemNum - 1));
225  for (unsigned int i = 0; i < alleleNum; ++i)
226  {
228  st.getToken(5 + i),
229  TextTools::toDouble(st.getToken(5 + alleleNum + i)),
230  TextTools::to<unsigned int>(st.getToken(5 + (2 * alleleNum) + i)),
231  TextTools::toDouble(st.getToken(5 + (3 * alleleNum) + i))
232  );
233  alleles_.push_back(al);
234  }
235 }
Record(const std::string &row)
Constructor.
The MultiAlleleMonolocusGenotype class.
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet on istream.
static const std::string AN_H
STL namespace.
static const std::string SIZE_H
The BasicAlleleInfo class.
The Individual class.
Definition: Individual.h:75
static const std::string SAMPLE_NAME_H
static const std::string PANEL_H
void read(std::istream &is, DataSet &data_set)
Read a DataSet on istream.
static const std::string HEIGHT_H
static const std::string DAC_H
std::vector< GeneMapperCsvExport::Allele > alleles_
static const std::string MARKER_H
static unsigned int UNKNOWN
Definition: LocusInfo.h:74
static const std::string DYE_H
The AnalyzedLoci class.
Definition: AnalyzedLoci.h:64
static const std::string SAMPLE_FILE_H
Store data for one allele.
static const std::string PEAK_AREA_H
The LocusInfo class.
Definition: LocusInfo.h:63
static const std::string ALLELE_H
The DataSet class.
Definition: DataSet.h:73