1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 26 January 2011
7 // ---------------------------------------------------------------------------
8 // Provides general utilities used by BamTools sub-tools.
9 // ***************************************************************************
11 #include <api/BamMultiReader.h>
12 #include <api/BamReader.h>
13 #include <utils/bamtools_utilities.h>
14 using namespace BamTools;
24 const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H',
32 } // namespace BamTools
34 // check if a file exists
35 bool Utilities::FileExists(const string& filename) {
36 ifstream f(filename.c_str(), ifstream::in);
40 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
41 // Returns success (true/false)
42 bool Utilities::ParseRegionString(const string& regionString,
43 const BamReader& reader,
46 // -------------------------------
47 // parse region string
49 // check first for empty string
50 if ( regionString.empty() )
53 // non-empty string, look for a colom
54 size_t foundFirstColon = regionString.find(':');
56 // store chrom strings, and numeric positions
63 // going to use entire contents of requested chromosome
64 // just store entire region string as startChrom name
65 // use BamReader methods to check if its valid for current BAM file
66 if ( foundFirstColon == string::npos ) {
67 startChrom = regionString;
69 stopChrom = regionString;
73 // colon found, so we at least have some sort of startPos requested
76 // store start chrom from beginning to first colon
77 startChrom = regionString.substr(0,foundFirstColon);
79 // look for ".." after the colon
80 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
83 // so we have a startPos but no range
84 // store contents before colon as startChrom, after as startPos
85 if ( foundRangeDots == string::npos ) {
86 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
87 stopChrom = startChrom;
91 // ".." found, so we have some sort of range selected
94 // store startPos between first colon and range dots ".."
95 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
97 // look for second colon
98 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
100 // no second colon found
101 // so we have a "standard" chrom:start..stop input format (on single chrom)
102 if ( foundSecondColon == string::npos ) {
103 stopChrom = startChrom;
104 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
107 // second colon found
108 // so we have a range requested across 2 chrom's
110 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
111 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
116 // -------------------------------
117 // validate reference IDs & genomic positions
119 const RefVector references = reader.GetReferenceData();
121 // if startRefID not found, return false
122 int startRefID = reader.GetReferenceID(startChrom);
123 if ( startRefID == (int)references.size() ) return false;
125 // if startPos is larger than reference, return false
126 const RefData& startReference = references.at(startRefID);
127 if ( startPos > startReference.RefLength ) return false;
129 // if stopRefID not found, return false
130 int stopRefID = reader.GetReferenceID(stopChrom);
131 if ( stopRefID == (int)references.size() ) return false;
133 // if stopPosition larger than reference, return false
134 const RefData& stopReference = references.at(stopRefID);
135 if ( stopPos > stopReference.RefLength ) return false;
137 // if no stopPosition specified, set to reference end
138 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
140 // -------------------------------
141 // set up Region struct & return
143 region.LeftRefID = startRefID;
144 region.LeftPosition = startPos;
145 region.RightRefID = stopRefID;;
146 region.RightPosition = stopPos;
150 // Same as ParseRegionString() above, but accepts a BamMultiReader
151 bool Utilities::ParseRegionString(const string& regionString,
152 const BamMultiReader& reader,
155 // -------------------------------
156 // parse region string
158 // check first for empty string
159 if ( regionString.empty() )
162 // non-empty string, look for a colom
163 size_t foundFirstColon = regionString.find(':');
165 // store chrom strings, and numeric positions
172 // going to use entire contents of requested chromosome
173 // just store entire region string as startChrom name
174 // use BamReader methods to check if its valid for current BAM file
175 if ( foundFirstColon == string::npos ) {
176 startChrom = regionString;
178 stopChrom = regionString;
182 // colon found, so we at least have some sort of startPos requested
185 // store start chrom from beginning to first colon
186 startChrom = regionString.substr(0,foundFirstColon);
188 // look for ".." after the colon
189 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
192 // so we have a startPos but no range
193 // store contents before colon as startChrom, after as startPos
194 if ( foundRangeDots == string::npos ) {
195 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
196 stopChrom = startChrom;
200 // ".." found, so we have some sort of range selected
203 // store startPos between first colon and range dots ".."
204 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
206 // look for second colon
207 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
209 // no second colon found
210 // so we have a "standard" chrom:start..stop input format (on single chrom)
211 if ( foundSecondColon == string::npos ) {
212 stopChrom = startChrom;
213 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
216 // second colon found
217 // so we have a range requested across 2 chrom's
219 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
220 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
225 // -------------------------------
226 // validate reference IDs & genomic positions
228 const RefVector references = reader.GetReferenceData();
230 // if startRefID not found, return false
231 int startRefID = reader.GetReferenceID(startChrom);
232 if ( startRefID == (int)references.size() ) return false;
234 // if startPos is larger than reference, return false
235 const RefData& startReference = references.at(startRefID);
236 if ( startPos > startReference.RefLength ) return false;
238 // if stopRefID not found, return false
239 int stopRefID = reader.GetReferenceID(stopChrom);
240 if ( stopRefID == (int)references.size() ) return false;
242 // if stopPosition larger than reference, return false
243 const RefData& stopReference = references.at(stopRefID);
244 if ( stopPos > stopReference.RefLength ) return false;
246 // if no stopPosition specified, set to reference end
247 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
249 // -------------------------------
250 // set up Region struct & return
252 region.LeftRefID = startRefID;
253 region.LeftPosition = startPos;
254 region.RightRefID = stopRefID;;
255 region.RightPosition = stopPos;
259 void Utilities::Reverse(string& sequence) {
260 reverse(sequence.begin(), sequence.end());
263 void Utilities::ReverseComplement(string& sequence) {
265 // do complement, in-place
266 size_t seqLength = sequence.length();
267 for ( size_t i = 0; i < seqLength; ++i )
268 sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]);