1 // ***************************************************************************
2 // bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 November 2010
7 // ---------------------------------------------------------------------------
8 // Provides general utilities used by BamTools sub-tools.
9 // ***************************************************************************
11 #include <api/BamMultiReader.h>
12 #include <api/BamReader.h>
13 #include <utils/bamtools_utilities.h>
14 using namespace BamTools;
24 const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', 0, 0, 'C', 'D', 0, 0, 0, 0, 'K', 'N', 0, 0, 0, 'Y', 'W', 'A', 'A', 'B', 'S', 'X', 'R', 0 };
26 } // namespace BamTools
28 // check if a file exists
29 bool Utilities::FileExists(const std::string& filename) {
30 ifstream f(filename.c_str(), ifstream::in);
34 // Parses a region string, does validation (valid ID's, positions), stores in Region struct
35 // Returns success (true/false)
36 bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) {
38 // -------------------------------
39 // parse region string
41 // check first for empty string
42 if ( regionString.empty() )
45 // non-empty string, look for a colom
46 size_t foundFirstColon = regionString.find(':');
48 // store chrom strings, and numeric positions
55 // going to use entire contents of requested chromosome
56 // just store entire region string as startChrom name
57 // use BamReader methods to check if its valid for current BAM file
58 if ( foundFirstColon == string::npos ) {
59 startChrom = regionString;
61 stopChrom = regionString;
65 // colon found, so we at least have some sort of startPos requested
68 // store start chrom from beginning to first colon
69 startChrom = regionString.substr(0,foundFirstColon);
71 // look for ".." after the colon
72 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
75 // so we have a startPos but no range
76 // store contents before colon as startChrom, after as startPos
77 if ( foundRangeDots == string::npos ) {
78 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
79 stopChrom = startChrom;
83 // ".." found, so we have some sort of range selected
86 // store startPos between first colon and range dots ".."
87 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
89 // look for second colon
90 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
92 // no second colon found
93 // so we have a "standard" chrom:start..stop input format (on single chrom)
94 if ( foundSecondColon == string::npos ) {
95 stopChrom = startChrom;
96 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
100 // so we have a range requested across 2 chrom's
102 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
103 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
108 // -------------------------------
109 // validate reference IDs & genomic positions
111 const RefVector references = reader.GetReferenceData();
113 // if startRefID not found, return false
114 int startRefID = reader.GetReferenceID(startChrom);
115 if ( startRefID == (int)references.size() ) return false;
117 // if startPos is larger than reference, return false
118 const RefData& startReference = references.at(startRefID);
119 if ( startPos > startReference.RefLength ) return false;
121 // if stopRefID not found, return false
122 int stopRefID = reader.GetReferenceID(stopChrom);
123 if ( stopRefID == (int)references.size() ) return false;
125 // if stopPosition larger than reference, return false
126 const RefData& stopReference = references.at(stopRefID);
127 if ( stopPos > stopReference.RefLength ) return false;
129 // if no stopPosition specified, set to reference end
130 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
132 // -------------------------------
133 // set up Region struct & return
135 region.LeftRefID = startRefID;
136 region.LeftPosition = startPos;
137 region.RightRefID = stopRefID;;
138 region.RightPosition = stopPos;
142 // Same as ParseRegionString() above, but accepts a BamMultiReader
143 bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) {
145 // -------------------------------
146 // parse region string
148 // check first for empty string
149 if ( regionString.empty() )
152 // non-empty string, look for a colom
153 size_t foundFirstColon = regionString.find(':');
155 // store chrom strings, and numeric positions
162 // going to use entire contents of requested chromosome
163 // just store entire region string as startChrom name
164 // use BamReader methods to check if its valid for current BAM file
165 if ( foundFirstColon == string::npos ) {
166 startChrom = regionString;
168 stopChrom = regionString;
172 // colon found, so we at least have some sort of startPos requested
175 // store start chrom from beginning to first colon
176 startChrom = regionString.substr(0,foundFirstColon);
178 // look for ".." after the colon
179 size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
182 // so we have a startPos but no range
183 // store contents before colon as startChrom, after as startPos
184 if ( foundRangeDots == string::npos ) {
185 startPos = atoi( regionString.substr(foundFirstColon+1).c_str() );
186 stopChrom = startChrom;
190 // ".." found, so we have some sort of range selected
193 // store startPos between first colon and range dots ".."
194 startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
196 // look for second colon
197 size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
199 // no second colon found
200 // so we have a "standard" chrom:start..stop input format (on single chrom)
201 if ( foundSecondColon == string::npos ) {
202 stopChrom = startChrom;
203 stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() );
206 // second colon found
207 // so we have a range requested across 2 chrom's
209 stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
210 stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() );
215 // -------------------------------
216 // validate reference IDs & genomic positions
218 const RefVector references = reader.GetReferenceData();
220 // if startRefID not found, return false
221 int startRefID = reader.GetReferenceID(startChrom);
222 if ( startRefID == (int)references.size() ) return false;
224 // if startPos is larger than reference, return false
225 const RefData& startReference = references.at(startRefID);
226 if ( startPos > startReference.RefLength ) return false;
228 // if stopRefID not found, return false
229 int stopRefID = reader.GetReferenceID(stopChrom);
230 if ( stopRefID == (int)references.size() ) return false;
232 // if stopPosition larger than reference, return false
233 const RefData& stopReference = references.at(stopRefID);
234 if ( stopPos > stopReference.RefLength ) return false;
236 // if no stopPosition specified, set to reference end
237 if ( stopPos == -1 ) stopPos = stopReference.RefLength;
239 // -------------------------------
240 // set up Region struct & return
242 region.LeftRefID = startRefID;
243 region.LeftPosition = startPos;
244 region.RightRefID = stopRefID;;
245 region.RightPosition = stopPos;
250 void Utilities::Reverse(string& sequence) {
251 reverse(sequence.begin(), sequence.end());
254 void Utilities::ReverseComplement(std::string& sequence) {
257 size_t seqLength = sequence.length();
258 for ( size_t i = 0; i < seqLength; ++i )
259 sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]);