1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 September 2010
7 // ---------------------------------------------------------------------------
8 // Provides general utilities used by BamTools sub-tools.
9 // ***************************************************************************
15 #include "bamtools_utilities.h"
16 #include "BamReader.h"
17 #include "BamMultiReader.h"
19 using namespace BamTools;
23 const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', 0, 0, 'C', 'D', 0, 0, 0, 0, 'K', 'N', 0, 0, 0, 'Y', 'W', 'A', 'A', 'B', 'S', 'X', 'R', 0 };
25 } // namespace BamTools
27 // check if a file exists
28 bool Utilities::FileExists(const std::string& filename) {
29 ifstream f(filename.c_str(), ifstream::in);
33 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
34 // Returns success (true/false)
35 bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) {
37 // -------------------------------
38 // parse region string
40 // check first for empty string
41 if ( regionString.empty() )
44 // non-empty string, look for a colom
45 size_t foundFirstColon = regionString.find(':');
47 // store chrom strings, and numeric positions
54 // going to use entire contents of requested chromosome
55 // just store entire region string as startChrom name
56 // use BamReader methods to check if its valid for current BAM file
57 if ( foundFirstColon == string::npos ) {
58 startChrom = regionString;
60 stopChrom = regionString;
64 // colon found, so we at least have some sort of startPos requested
67 // store start chrom from beginning to first colon
68 startChrom = regionString.substr(0,foundFirstColon);
70 // look for ".." after the colon
71 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
74 // so we have a startPos but no range
75 // store contents before colon as startChrom, after as startPos
76 if ( foundRangeDots == string::npos ) {
77 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
78 stopChrom = startChrom;
82 // ".." found, so we have some sort of range selected
85 // store startPos between first colon and range dots ".."
86 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
88 // look for second colon
89 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
91 // no second colon found
92 // so we have a "standard" chrom:start..stop input format (on single chrom)
93 if ( foundSecondColon == string::npos ) {
94 stopChrom = startChrom;
95 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
99 // so we have a range requested across 2 chrom's
101 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
102 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
107 // -------------------------------
108 // validate reference IDs & genomic positions
110 const RefVector references = reader.GetReferenceData();
112 // if startRefID not found, return false
113 int startRefID = reader.GetReferenceID(startChrom);
114 if ( startRefID == (int)references.size() ) return false;
116 // if startPos is larger than reference, return false
117 const RefData& startReference = references.at(startRefID);
118 if ( startPos > startReference.RefLength ) return false;
120 // if stopRefID not found, return false
121 int stopRefID = reader.GetReferenceID(stopChrom);
122 if ( stopRefID == (int)references.size() ) return false;
124 // if stopPosition larger than reference, return false
125 const RefData& stopReference = references.at(stopRefID);
126 if ( stopPos > stopReference.RefLength ) return false;
128 // if no stopPosition specified, set to reference end
129 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
131 // -------------------------------
132 // set up Region struct & return
134 region.LeftRefID = startRefID;
135 region.LeftPosition = startPos;
136 region.RightRefID = stopRefID;;
137 region.RightPosition = stopPos;
141 // Same as ParseRegionString() above, but accepts a BamMultiReader
142 bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) {
144 // -------------------------------
145 // parse region string
147 // check first for empty string
148 if ( regionString.empty() )
151 // non-empty string, look for a colom
152 size_t foundFirstColon = regionString.find(':');
154 // store chrom strings, and numeric positions
161 // going to use entire contents of requested chromosome
162 // just store entire region string as startChrom name
163 // use BamReader methods to check if its valid for current BAM file
164 if ( foundFirstColon == string::npos ) {
165 startChrom = regionString;
167 stopChrom = regionString;
171 // colon found, so we at least have some sort of startPos requested
174 // store start chrom from beginning to first colon
175 startChrom = regionString.substr(0,foundFirstColon);
177 // look for ".." after the colon
178 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
181 // so we have a startPos but no range
182 // store contents before colon as startChrom, after as startPos
183 if ( foundRangeDots == string::npos ) {
184 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
185 stopChrom = startChrom;
189 // ".." found, so we have some sort of range selected
192 // store startPos between first colon and range dots ".."
193 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
195 // look for second colon
196 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
198 // no second colon found
199 // so we have a "standard" chrom:start..stop input format (on single chrom)
200 if ( foundSecondColon == string::npos ) {
201 stopChrom = startChrom;
202 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
205 // second colon found
206 // so we have a range requested across 2 chrom's
208 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
209 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
214 // -------------------------------
215 // validate reference IDs & genomic positions
217 const RefVector references = reader.GetReferenceData();
219 // if startRefID not found, return false
220 int startRefID = reader.GetReferenceID(startChrom);
221 if ( startRefID == (int)references.size() ) return false;
223 // if startPos is larger than reference, return false
224 const RefData& startReference = references.at(startRefID);
225 if ( startPos > startReference.RefLength ) return false;
227 // if stopRefID not found, return false
228 int stopRefID = reader.GetReferenceID(stopChrom);
229 if ( stopRefID == (int)references.size() ) return false;
231 // if stopPosition larger than reference, return false
232 const RefData& stopReference = references.at(stopRefID);
233 if ( stopPos > stopReference.RefLength ) return false;
235 // if no stopPosition specified, set to reference end
236 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
238 // -------------------------------
239 // set up Region struct & return
241 region.LeftRefID = startRefID;
242 region.LeftPosition = startPos;
243 region.RightRefID = stopRefID;;
244 region.RightPosition = stopPos;
249 void Utilities::Reverse(string& sequence) {
250 reverse(sequence.begin(), sequence.end());
253 void Utilities::ReverseComplement(std::string& sequence) {
256 size_t seqLength = sequence.length();
257 for ( size_t i = 0; i < seqLength; ++i )
258 sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]);