1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 9 June 2011
7 // ---------------------------------------------------------------------------
8 // Provides general utilities used by BamTools sub-tools.
9 // ***************************************************************************
11 #include <api/BamMultiReader.h>
12 #include <api/BamReader.h>
13 #include <utils/bamtools_utilities.h>
14 using namespace BamTools;
26 const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H',
34 } // namespace BamTools
36 // returns true if 'source' contains 'pattern'
37 bool Utilities::Contains(const string& source, const string& pattern) {
38 return ( source.find(pattern) != string::npos );
41 // returns true if 'source' contains 'c'
42 bool Utilities::Contains(const std::string &source, const char c) {
43 return ( source.find(c) != string::npos );
46 // returns true if 'source' ends with 'pattern'
47 bool Utilities::EndsWith(const string& source, const string& pattern) {
48 return ( source.find(pattern) == (source.length() - pattern.length()) );
51 // returns true if 'source' ends with 'c'
52 bool Utilities::EndsWith(const std::string& source, const char c) {
53 return ( source.find(c) == (source.length() - 1) );
56 // check if a file exists
57 bool Utilities::FileExists(const string& filename) {
58 ifstream f(filename.c_str(), ifstream::in);
62 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
63 // Returns success (true/false)
64 bool Utilities::ParseRegionString(const string& regionString,
65 const BamReader& reader,
68 // -------------------------------
69 // parse region string
71 // check first for empty string
72 if ( regionString.empty() )
75 // non-empty string, look for a colom
76 size_t foundFirstColon = regionString.find(':');
78 // store chrom strings, and numeric positions
85 // going to use entire contents of requested chromosome
86 // just store entire region string as startChrom name
87 // use BamReader methods to check if its valid for current BAM file
88 if ( foundFirstColon == string::npos ) {
89 startChrom = regionString;
91 stopChrom = regionString;
95 // colon found, so we at least have some sort of startPos requested
98 // store start chrom from beginning to first colon
99 startChrom = regionString.substr(0,foundFirstColon);
101 // look for ".." after the colon
102 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
105 // so we have a startPos but no range
106 // store contents before colon as startChrom, after as startPos
107 if ( foundRangeDots == string::npos ) {
108 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
109 stopChrom = startChrom;
113 // ".." found, so we have some sort of range selected
116 // store startPos between first colon and range dots ".."
117 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
119 // look for second colon
120 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
122 // no second colon found
123 // so we have a "standard" chrom:start..stop input format (on single chrom)
124 if ( foundSecondColon == string::npos ) {
125 stopChrom = startChrom;
126 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
129 // second colon found
130 // so we have a range requested across 2 chrom's
132 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
133 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
138 // -------------------------------
139 // validate reference IDs & genomic positions
141 const RefVector references = reader.GetReferenceData();
143 // if startRefID not found, return false
144 int startRefID = reader.GetReferenceID(startChrom);
145 if ( startRefID == (int)references.size() ) return false;
147 // if startPos is larger than reference, return false
148 const RefData& startReference = references.at(startRefID);
149 if ( startPos > startReference.RefLength ) return false;
151 // if stopRefID not found, return false
152 int stopRefID = reader.GetReferenceID(stopChrom);
153 if ( stopRefID == (int)references.size() ) return false;
155 // if stopPosition larger than reference, return false
156 const RefData& stopReference = references.at(stopRefID);
157 if ( stopPos > stopReference.RefLength ) return false;
159 // if no stopPosition specified, set to reference end
160 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
162 // -------------------------------
163 // set up Region struct & return
165 region.LeftRefID = startRefID;
166 region.LeftPosition = startPos;
167 region.RightRefID = stopRefID;;
168 region.RightPosition = stopPos;
172 // Same as ParseRegionString() above, but accepts a BamMultiReader
173 bool Utilities::ParseRegionString(const string& regionString,
174 const BamMultiReader& reader,
177 // -------------------------------
178 // parse region string
180 // check first for empty string
181 if ( regionString.empty() )
184 // non-empty string, look for a colom
185 size_t foundFirstColon = regionString.find(':');
187 // store chrom strings, and numeric positions
194 // going to use entire contents of requested chromosome
195 // just store entire region string as startChrom name
196 // use BamReader methods to check if its valid for current BAM file
197 if ( foundFirstColon == string::npos ) {
198 startChrom = regionString;
200 stopChrom = regionString;
204 // colon found, so we at least have some sort of startPos requested
207 // store start chrom from beginning to first colon
208 startChrom = regionString.substr(0,foundFirstColon);
210 // look for ".." after the colon
211 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
214 // so we have a startPos but no range
215 // store contents before colon as startChrom, after as startPos
216 if ( foundRangeDots == string::npos ) {
217 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
218 stopChrom = startChrom;
222 // ".." found, so we have some sort of range selected
225 // store startPos between first colon and range dots ".."
226 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
228 // look for second colon
229 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
231 // no second colon found
232 // so we have a "standard" chrom:start..stop input format (on single chrom)
233 if ( foundSecondColon == string::npos ) {
234 stopChrom = startChrom;
235 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
238 // second colon found
239 // so we have a range requested across 2 chrom's
241 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
242 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
247 // -------------------------------
248 // validate reference IDs & genomic positions
250 const RefVector references = reader.GetReferenceData();
252 // if startRefID not found, return false
253 int startRefID = reader.GetReferenceID(startChrom);
254 if ( startRefID == (int)references.size() ) return false;
256 // if startPos is larger than reference, return false
257 const RefData& startReference = references.at(startRefID);
258 if ( startPos > startReference.RefLength ) return false;
260 // if stopRefID not found, return false
261 int stopRefID = reader.GetReferenceID(stopChrom);
262 if ( stopRefID == (int)references.size() ) return false;
264 // if stopPosition larger than reference, return false
265 const RefData& stopReference = references.at(stopRefID);
266 if ( stopPos > stopReference.RefLength ) return false;
268 // if no stopPosition specified, set to reference end
269 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
271 // -------------------------------
272 // set up Region struct & return
274 region.LeftRefID = startRefID;
275 region.LeftPosition = startPos;
276 region.RightRefID = stopRefID;;
277 region.RightPosition = stopPos;
281 void Utilities::Reverse(string& sequence) {
282 reverse(sequence.begin(), sequence.end());
285 void Utilities::ReverseComplement(string& sequence) {
287 // do complement, in-place
288 size_t seqLength = sequence.length();
289 for ( size_t i = 0; i < seqLength; ++i )
290 sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]);
296 vector<string> Utilities::Split(const string& source, const char delim) {
298 stringstream ss(source);
300 vector<string> fields;
302 while ( getline(ss, field, delim) )
303 fields.push_back(field);
307 vector<string> Utilities::Split(const string& source, const string& delims) {
309 vector<string> fields;
312 char* cchars = new char[source.size()+1];
313 char* cstr = &cchars[0];
314 strcpy(cstr, source.c_str());
315 tok = strtok(cstr, delims.c_str());
316 while (tok != NULL) {
317 fields.push_back(tok);
318 tok = strtok(NULL, delims.c_str());
326 // returns true if 'source' starts with 'pattern'
327 bool Utilities::StartsWith(const string& source, const string& pattern) {
328 return ( source.find(pattern) == 0 );
331 // returns true if 'source' starts with 'c'
332 bool Utilities::StartsWith(const std::string &source, const char c) {
333 return ( source.find(c) == 0 );