1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 8 October 2011
6 // ---------------------------------------------------------------------------
7 // Provides general utilities used by BamTools sub-tools.
8 // ***************************************************************************
10 #include <api/BamMultiReader.h>
11 #include <api/BamReader.h>
12 #include <utils/bamtools_utilities.h>
13 using namespace BamTools;
25 const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H',
33 } // namespace BamTools
35 // returns true if 'source' contains 'pattern'
36 bool Utilities::Contains(const string& source, const string& pattern) {
37 return ( source.find(pattern) != string::npos );
40 // returns true if 'source' contains 'c'
41 bool Utilities::Contains(const std::string &source, const char c) {
42 return ( source.find(c) != string::npos );
45 // returns true if 'source' ends with 'pattern'
46 bool Utilities::EndsWith(const string& source, const string& pattern) {
47 return ( source.find(pattern) == (source.length() - pattern.length()) );
50 // returns true if 'source' ends with 'c'
51 bool Utilities::EndsWith(const std::string& source, const char c) {
52 return ( source.find(c) == (source.length() - 1) );
55 // check if a file exists
56 bool Utilities::FileExists(const string& filename) {
57 ifstream f(filename.c_str(), ifstream::in);
61 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
62 // Returns success (true/false)
63 bool Utilities::ParseRegionString(const string& regionString,
64 const BamReader& reader,
67 // -------------------------------
68 // parse region string
70 // check first for empty string
71 if ( regionString.empty() )
74 // non-empty string, look for a colom
75 size_t foundFirstColon = regionString.find(':');
77 // store chrom strings, and numeric positions
84 // going to use entire contents of requested chromosome
85 // just store entire region string as startChrom name
86 // use BamReader methods to check if its valid for current BAM file
87 if ( foundFirstColon == string::npos ) {
88 startChrom = regionString;
90 stopChrom = regionString;
94 // colon found, so we at least have some sort of startPos requested
97 // store start chrom from beginning to first colon
98 startChrom = regionString.substr(0,foundFirstColon);
100 // look for ".." after the colon
101 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
104 // so we have a startPos but no range
105 // store contents before colon as startChrom, after as startPos
106 if ( foundRangeDots == string::npos ) {
107 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
108 stopChrom = startChrom;
112 // ".." found, so we have some sort of range selected
115 // store startPos between first colon and range dots ".."
116 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
118 // look for second colon
119 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
121 // no second colon found
122 // so we have a "standard" chrom:start..stop input format (on single chrom)
123 if ( foundSecondColon == string::npos ) {
124 stopChrom = startChrom;
125 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
128 // second colon found
129 // so we have a range requested across 2 chrom's
131 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
132 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
137 // -------------------------------
138 // validate reference IDs & genomic positions
140 const RefVector references = reader.GetReferenceData();
142 // if startRefID not found, return false
143 int startRefID = reader.GetReferenceID(startChrom);
144 if ( startRefID == -1 ) return false;
146 // startPos cannot be greater than or equal to reference length
147 const RefData& startReference = references.at(startRefID);
148 if ( startPos >= startReference.RefLength ) return false;
150 // if stopRefID not found, return false
151 int stopRefID = reader.GetReferenceID(stopChrom);
152 if ( stopRefID == -1 ) return false;
154 // stopPosition cannot be larger than reference length
155 const RefData& stopReference = references.at(stopRefID);
156 if ( stopPos > stopReference.RefLength ) return false;
158 // if no stopPosition specified, set to reference end
159 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
161 // -------------------------------
162 // set up Region struct & return
164 region.LeftRefID = startRefID;
165 region.LeftPosition = startPos;
166 region.RightRefID = stopRefID;;
167 region.RightPosition = stopPos;
171 // Same as ParseRegionString() above, but accepts a BamMultiReader
172 bool Utilities::ParseRegionString(const string& regionString,
173 const BamMultiReader& reader,
176 // -------------------------------
177 // parse region string
179 // check first for empty string
180 if ( regionString.empty() )
183 // non-empty string, look for a colom
184 size_t foundFirstColon = regionString.find(':');
186 // store chrom strings, and numeric positions
193 // going to use entire contents of requested chromosome
194 // just store entire region string as startChrom name
195 // use BamReader methods to check if its valid for current BAM file
196 if ( foundFirstColon == string::npos ) {
197 startChrom = regionString;
199 stopChrom = regionString;
203 // colon found, so we at least have some sort of startPos requested
206 // store start chrom from beginning to first colon
207 startChrom = regionString.substr(0,foundFirstColon);
209 // look for ".." after the colon
210 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
213 // so we have a startPos but no range
214 // store contents before colon as startChrom, after as startPos
215 if ( foundRangeDots == string::npos ) {
216 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
217 stopChrom = startChrom;
221 // ".." found, so we have some sort of range selected
224 // store startPos between first colon and range dots ".."
225 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
227 // look for second colon
228 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
230 // no second colon found
231 // so we have a "standard" chrom:start..stop input format (on single chrom)
232 if ( foundSecondColon == string::npos ) {
233 stopChrom = startChrom;
234 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
237 // second colon found
238 // so we have a range requested across 2 chrom's
240 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
241 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
246 // -------------------------------
247 // validate reference IDs & genomic positions
249 const RefVector references = reader.GetReferenceData();
251 // if startRefID not found, return false
252 int startRefID = reader.GetReferenceID(startChrom);
253 if ( startRefID == -1 ) return false;
255 // startPos cannot be greater than or equal to reference length
256 const RefData& startReference = references.at(startRefID);
257 if ( startPos >= startReference.RefLength ) return false;
259 // if stopRefID not found, return false
260 int stopRefID = reader.GetReferenceID(stopChrom);
261 if ( stopRefID == -1 ) return false;
263 // stopPosition cannot be larger than reference length
264 const RefData& stopReference = references.at(stopRefID);
265 if ( stopPos > stopReference.RefLength ) return false;
267 // if no stopPosition specified, set to reference end
268 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
270 // -------------------------------
271 // set up Region struct & return
273 region.LeftRefID = startRefID;
274 region.LeftPosition = startPos;
275 region.RightRefID = stopRefID;;
276 region.RightPosition = stopPos;
280 void Utilities::Reverse(string& sequence) {
281 reverse(sequence.begin(), sequence.end());
284 void Utilities::ReverseComplement(string& sequence) {
286 // do complement, in-place
287 size_t seqLength = sequence.length();
288 for ( size_t i = 0; i < seqLength; ++i )
289 sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]);
295 vector<string> Utilities::Split(const string& source, const char delim) {
297 stringstream ss(source);
299 vector<string> fields;
301 while ( getline(ss, field, delim) )
302 fields.push_back(field);
306 vector<string> Utilities::Split(const string& source, const string& delims) {
308 vector<string> fields;
311 char* cchars = new char[source.size()+1];
312 char* cstr = &cchars[0];
313 strcpy(cstr, source.c_str());
314 tok = strtok(cstr, delims.c_str());
315 while (tok != NULL) {
316 fields.push_back(tok);
317 tok = strtok(NULL, delims.c_str());
325 // returns true if 'source' starts with 'pattern'
326 bool Utilities::StartsWith(const string& source, const string& pattern) {
327 return ( source.find(pattern) == 0 );
330 // returns true if 'source' starts with 'c'
331 bool Utilities::StartsWith(const std::string &source, const char c) {
332 return ( source.find(c) == 0 );