1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 2 June 2010
7 // ---------------------------------------------------------------------------
8 // Provides general utilities used by BamTools sub-tools.
9 // ***************************************************************************
13 #include "bamtools_utilities.h"
14 #include "BamReader.h"
15 #include "BamMultiReader.h"
18 using namespace BamTools;
20 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
21 // Returns success (true/false)
22 bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) {
24 // -------------------------------
25 // parse region string
27 // check first for empty string
28 if ( regionString.empty() )
31 // non-empty string, look for a colom
32 size_t foundFirstColon = regionString.find(':');
34 // store chrom strings, and numeric positions
41 // going to use entire contents of requested chromosome
42 // just store entire region string as startChrom name
43 // use BamReader methods to check if its valid for current BAM file
44 if ( foundFirstColon == string::npos ) {
45 startChrom = regionString;
47 stopChrom = regionString;
51 // colon found, so we at least have some sort of startPos requested
54 // store start chrom from beginning to first colon
55 startChrom = regionString.substr(0,foundFirstColon);
57 // look for ".." after the colon
58 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
61 // so we have a startPos but no range
62 // store contents before colon as startChrom, after as startPos
63 if ( foundRangeDots == string::npos ) {
64 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
65 stopChrom = startChrom;
69 // ".." found, so we have some sort of range selected
72 // store startPos between first colon and range dots ".."
73 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
75 // look for second colon
76 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
78 // no second colon found
79 // so we have a "standard" chrom:start..stop input format (on single chrom)
80 if ( foundSecondColon == string::npos ) {
81 stopChrom = startChrom;
82 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
86 // so we have a range requested across 2 chrom's
88 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
89 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
94 // -------------------------------
95 // validate reference IDs & genomic positions
97 const RefVector references = reader.GetReferenceData();
99 // if startRefID not found, return false
100 int startRefID = reader.GetReferenceID(startChrom);
101 if ( startRefID == (int)references.size() ) return false;
103 // if startPos is larger than reference, return false
104 const RefData& startReference = references.at(startRefID);
105 if ( startPos > startReference.RefLength ) return false;
107 // if stopRefID not found, return false
108 int stopRefID = reader.GetReferenceID(stopChrom);
109 if ( stopRefID == (int)references.size() ) return false;
111 // if stopPosition larger than reference, return false
112 const RefData& stopReference = references.at(stopRefID);
113 if ( stopPos > stopReference.RefLength ) return false;
115 // if no stopPosition specified, set to reference end
116 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
118 // -------------------------------
119 // set up Region struct & return
121 region.LeftRefID = startRefID;
122 region.LeftPosition = startPos;
123 region.RightRefID = stopRefID;;
124 region.RightPosition = stopPos;
128 // Same as ParseRegionString() above, but accepts a BamMultiReader
129 bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) {
131 // -------------------------------
132 // parse region string
134 // check first for empty string
135 if ( regionString.empty() )
138 // non-empty string, look for a colom
139 size_t foundFirstColon = regionString.find(':');
141 // store chrom strings, and numeric positions
148 // going to use entire contents of requested chromosome
149 // just store entire region string as startChrom name
150 // use BamReader methods to check if its valid for current BAM file
151 if ( foundFirstColon == string::npos ) {
152 startChrom = regionString;
154 stopChrom = regionString;
158 // colon found, so we at least have some sort of startPos requested
161 // store start chrom from beginning to first colon
162 startChrom = regionString.substr(0,foundFirstColon);
164 // look for ".." after the colon
165 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
168 // so we have a startPos but no range
169 // store contents before colon as startChrom, after as startPos
170 if ( foundRangeDots == string::npos ) {
171 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
172 stopChrom = startChrom;
176 // ".." found, so we have some sort of range selected
179 // store startPos between first colon and range dots ".."
180 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
182 // look for second colon
183 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
185 // no second colon found
186 // so we have a "standard" chrom:start..stop input format (on single chrom)
187 if ( foundSecondColon == string::npos ) {
188 stopChrom = startChrom;
189 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
192 // second colon found
193 // so we have a range requested across 2 chrom's
195 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
196 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
201 // -------------------------------
202 // validate reference IDs & genomic positions
204 const RefVector references = reader.GetReferenceData();
206 // if startRefID not found, return false
207 int startRefID = reader.GetReferenceID(startChrom);
208 if ( startRefID == (int)references.size() ) return false;
210 // if startPos is larger than reference, return false
211 const RefData& startReference = references.at(startRefID);
212 if ( startPos > startReference.RefLength ) return false;
214 // if stopRefID not found, return false
215 int stopRefID = reader.GetReferenceID(stopChrom);
216 if ( stopRefID == (int)references.size() ) return false;
218 // if stopPosition larger than reference, return false
219 const RefData& stopReference = references.at(stopRefID);
220 if ( stopPos > stopReference.RefLength ) return false;
222 // if no stopPosition specified, set to reference end
223 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
225 // -------------------------------
226 // set up Region struct & return
228 region.LeftRefID = startRefID;
229 region.LeftPosition = startPos;
230 region.RightRefID = stopRefID;;
231 region.RightPosition = stopPos;
236 bool Utilities::FileExists(const std::string& filename) {
238 struct stat fileInfo;
239 return stat(filename.c_str(), &fileInfo) == 0;