bpp-seq-omics  2.2.0
VcfOutputMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: VcfOutputMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Jan 05 2013
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "VcfOutputMafIterator.h"
41 
42 //From bpp-seq:
43 #include <Bpp/Seq/SequenceWithAnnotationTools.h>
44 #include <Bpp/Seq/SequenceWithQuality.h>
45 #include <Bpp/Seq/Container/VectorSiteContainer.h>
46 #include <Bpp/Seq/SiteTools.h>
47 #include <Bpp/Seq/SequenceWalker.h>
48 
49 using namespace bpp;
50 
51 //From the STL:
52 #include <string>
53 #include <numeric>
54 #include <ctime>
55 
56 using namespace std;
57 
58 void VcfOutputMafIterator::writeHeader_(std::ostream& out) const
59 {
60  time_t t = time(0); // get current time
61  struct tm* ct = localtime(&t);
62  out << "##fileformat=VCFv4.0" << endl;
63  out << "##fileDate=" << (ct->tm_year + 1900) << (ct->tm_mon + 1) << ct->tm_mday << endl;
64  out << "##source=Bio++" << endl;
65  out << "##FILTER=<ID=gap,Description=\"At least one sequence contains a gap\">" << endl;
66  out << "##FILTER=<ID=unk,Description=\"At least one sequence contains an unresolved character\">" << endl;
67  if (genotypes_.size() > 0)
68  out << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
69  //There are more options in the header that we may want to support...
70 
71  //Now write the header line:
72  out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
73  if (genotypes_.size() > 0) {
74  out << "\tFORMAT";
75  for (size_t i = 0; i < genotypes_.size(); ++i)
76  out << "\t" << genotypes_[i];
77  }
78  out << endl;
79 }
80 
81 void VcfOutputMafIterator::writeBlock_(std::ostream& out, const MafBlock& block) const
82 {
83  const MafSequence& refSeq = block.getSequenceForSpecies(refSpecies_);
84  string chr = refSeq.getChromosome();
85  SequenceWalker walker(refSeq);
86  size_t offset = refSeq.start();
87  int gap = refSeq.getAlphabet()->getGapCharacterCode();
88  map<int, string> chars;
89  for (int i = 0; i < static_cast<int>(AlphabetTools::DNA_ALPHABET.getNumberOfTypes()); ++i)
90  chars[i] = AlphabetTools::DNA_ALPHABET.intToChar(i);
91  VectorSiteContainer sites(block.getAlignment());
92  //Where to store genotype information, if any:
93  vector<int> gt(genotypes_.size());
94  //Now we look all sites for SNPs:
95  for (size_t i = 0; i < sites.getNumberOfSites(); i++) {
96  if (refSeq[i] == gap)
97  continue;
98  string filter = "";
99  if (SiteTools::hasGap(sites.getSite(i))) {
100  filter = "gap";
101  }
102  if (SiteTools::hasUnknown(sites.getSite(i))) {
103  if (filter != "")
104  filter += ",";
105  filter += "unk";
106  }
107  if (filter == "")
108  filter = "PASS";
109 
110  map<int, size_t> counts;
111  SiteTools::getCounts(sites.getSite(i), counts);
112  int ref = refSeq[i];
113  string alt = "";
114  string ac = "";
115 
116  map<int, int> snps;
117  int c = 0;
118  for (int x = 0; x < 4; ++x) {
119  if (x != ref) {
120  size_t f = counts[x];
121  if (f > 0) {
122  if (alt != "") {
123  alt += ",";
124  ac += ",";
125  }
126  alt += chars[x];
127  ac += TextTools::toString(f);
128  snps[x] = ++c;
129  }
130  } else {
131  snps[x] = 0;
132  }
133  }
134  if (ac != "") {
135  out << chr << "\t" << (offset + walker.getSequencePosition(i) + 1) << "\t.\t" << chars[refSeq[i]] << "\t" << alt << "\t.\t" << filter << "\tAC=" << ac;
136  //Write genotpyes:
137  if (genotypes_.size() > 0) {
138  out << "\tGT";
139  for (size_t g = 0; g < genotypes_.size(); ++g) {
140  vector<const MafSequence*> sequences = block.getSequencesForSpecies(genotypes_[g]);
141  if (sequences.size() == 0)
142  out << "\t.";
143  else if (sequences.size() > 1)
144  throw Exception("VcfOutputMafIterator::writeBlock(). Duplicated sequence for species '" + genotypes_[g] + ",.");
145  else {
146  int state = (*sequences[0])[i];
147  if (AlphabetTools::DNA_ALPHABET.isGap(state) || AlphabetTools::DNA_ALPHABET.isUnresolved(state))
148  out << "\t.";
149  else
150  out << "\t" << snps[state];
151  }
152  }
153  }
154  out << endl;
155  }
156  }
157 }
158 
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
void writeBlock_(std::ostream &out, const MafBlock &block) const
std::vector< const MafSequence * > getSequencesForSpecies(const std::string &species) const
Definition: MafBlock.h:149
STL namespace.
AlignedSequenceContainer & getAlignment()
Definition: MafBlock.h:108
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:55
const std::string & getChromosome() const
Definition: MafSequence.h:154
void writeHeader_(std::ostream &out) const
size_t start() const
Definition: MafSequence.h:106
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:62