// BamAlignment.cpp (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 7 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************
return GetTag("NM", (uint32_t&)editDistance);
}
-/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const
+/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = USE_CLOSED_DEFAULT) const
\brief Calculates alignment end position, based on starting position and CIGAR data.
- \param usePadded Inserted bases affect reported position. Default is false, so that reported
- position stays 'sync-ed' with reference coordinates.
- \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful
- when using BAM data with half-open formats (e.g. BED).
+ \warning The position returned now represents a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED. I whole-heartedly apologize for any inconsistencies this
+ may have caused if you assumed that BT was always half-open; full aplogies also
+ to those who recognized that BamTools originally used a closed interval, but may
+ need to update their code to reflect this new change.
+
+ \param usePadded Allow inserted bases to affect the reported position. Default is false, so that
+ reported position stays synced with reference coordinates.
+
+ \param closedInterval Setting this to true will return a 0-based end coordinate. Default is false,
+ so that his value represents a standard, half-open interval.
\return alignment end position
*/
-int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
-
- // TODO: Come back to this for coordinate issues !!!
+int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const {
// initialize alignment end to starting position
int alignEnd = Position;
vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
for ( ; cigarIter != cigarEnd; ++cigarIter) {
- const char cigarType = (*cigarIter).Type;
- const uint32_t& cigarLength = (*cigarIter).Length;
-
- if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR ||
- cigarType == Constants::BAM_CIGAR_DEL_CHAR ||
- cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR )
- alignEnd += cigarLength;
- else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR )
- alignEnd += cigarLength;
+ const CigarOp& op = (*cigarIter);
+
+ switch ( op.Type ) {
+
+ // increase end position on CIGAR chars [DMXN=]
+ case Constants::BAM_CIGAR_DEL_CHAR :
+ case Constants::BAM_CIGAR_MATCH_CHAR :
+ case Constants::BAM_CIGAR_MISMATCH_CHAR :
+ case Constants::BAM_CIGAR_REFSKIP_CHAR :
+ case Constants::BAM_CIGAR_SEQMATCH_CHAR :
+ alignEnd += op.Length;
+ break;
+
+ // increase end position when CIGAR char is 'I' only if @usePadded is true
+ case Constants::BAM_CIGAR_INS_CHAR :
+ if ( usePadded )
+ alignEnd += op.Length;
+ break;
+
+ // all other CIGAR chars do not affect end position
+ default :
+ break;
+ }
}
- // adjust for zero-based coordinates, if requested
- if ( zeroBased ) alignEnd -= 1;
+ // adjust for closedInterval, if requested
+ if ( closedInterval )
+ alignEnd -= 1;
// return result
return alignEnd;
bool BuildCharData(void);
// calculates alignment end position
- int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
+ int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
// returns a description of the last error that occurred
std::string GetErrorString(void) const;
// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg\r
// Marth Lab, Department of Biology, Boston College\r
// ---------------------------------------------------------------------------\r
-// Last modified: 7 October 2011 (DB)\r
+// Last modified: 8 October 2011 (DB)\r
// ---------------------------------------------------------------------------\r
// Provides data structures & utility methods that are used throughout the API.\r
// ***************************************************************************\r
\brief Represents a sequential genomic region\r
\r
Allowed to span multiple (sequential) references.\r
+\r
+ \warning BamRegion now represents a zero-based, HALF-OPEN interval.\r
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated\r
+ as zero-based, CLOSED. I whole-heartedly apologize for any inconsistencies this\r
+ may have caused if you assumed that BT was always half-open; full aplogies also\r
+ to those who recognized that BamTools originally used a closed interval, but may\r
+ need to update their code to reflect this new change.\r
*/\r
struct API_EXPORT BamRegion {\r
\r
\r
//! Returns true if region has a right boundary\r
bool isRightBoundSpecified(void) const {\r
- return ( RightRefID >= 0 && RightPosition >= 0 );\r
+ return ( RightRefID >= 0 && RightPosition >= 1 );\r
}\r
};\r
\r
// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 7 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
//
const int& rightRefID,
const int& rightPosition)
{
- BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
- return d->SetRegion(region);
+ return d->SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );
}
// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 7 October 2011(DB)
+// Last modified: 8 October 2011(DB)
// ---------------------------------------------------------------------------
// Manages random access operations in a BAM file
// **************************************************************************
// if alignment starts at or after left bound position
if ( alignment.Position >= m_region.LeftPosition) {
- if ( m_region.isRightBoundSpecified() && // right bound is specified AND
- m_region.LeftRefID == m_region.RightRefID && // left & right bounds on same reference AND
- alignment.Position > m_region.RightPosition ) // alignment starts after right bound position
+ if ( m_region.isRightBoundSpecified() && // right bound is specified AND
+ m_region.LeftRefID == m_region.RightRefID && // left & right bounds on same reference AND
+ alignment.Position >= m_region.RightPosition ) // alignment starts on or after right bound position
return AfterRegion;
// otherwise, alignment overlaps region
else {
// if alignment overlaps left bound position
- if ( alignment.GetEndPosition() >= m_region.LeftPosition )
+ if ( alignment.GetEndPosition() > m_region.LeftPosition )
return OverlapsRegion;
else
return BeforeRegion;
// alignment is on right bound reference
else {
- // if alignment starts on or before right bound position
- if ( alignment.Position <= m_region.RightPosition )
+ // if alignment starts before right bound position
+ if ( alignment.Position < m_region.RightPosition )
return OverlapsRegion;
else
return AfterRegion;
}
}
- // otherwise, alignment starts after left bound and there is no right bound
+ // otherwise, alignment starts after left bound and there is no right bound given
else return OverlapsRegion;
}
}
// BamStandardIndex.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the standardized BAM index format (".bai")
// ***************************************************************************
// retrieve references from reader
const RefVector& references = m_reader->GetReferenceData();
- // make sure left-bound position is valid
- if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
+ // LeftPosition cannot be greater than or equal to reference length
+ if ( region.LeftPosition >= references.at(region.LeftRefID).RefLength )
throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested");
// set region 'begin'
end = (unsigned int)region.RightPosition;
// otherwise, set region 'end' to last reference base
- else end = (unsigned int)references.at(region.LeftRefID).RefLength - 1;
+ else end = (unsigned int)references.at(region.LeftRefID).RefLength;
}
+// [begin, end)
void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin,
const uint32_t& end,
set<uint16_t>& candidateBins)
*hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
// check alignment against region
- if ( al.GetEndPosition() < region.LeftPosition ) {
+ if ( al.GetEndPosition() <= region.LeftPosition ) {
offsetFirst = ++offsetIter;
count -= step+1;
} else count = step;
// BamToolsIndex.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the BamTools index format (".bti")
// ***************************************************************************
, m_cacheMode(BamIndex::LimitedIndexCaching)
, m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
, m_inputVersion(0)
- , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
+ , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
{
m_isBigEndian = BamTools::SystemIsBigEndian();
}
// check for deprecated, unsupported versions
// (the format had to be modified to accomodate a particular bug fix)
- else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) {
+ // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
+ // respondBy: throwing exception - we're not going to try to handle the old BTI files.
+ else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
const string message = "unsupported format: this version of the index may not properly handle "
- "reference ends. Please run 'bamtools index -bti -in yourData.bam' to "
- "generate an up-to-date, fixed BTI file.";
- throw BamException("BamToolsIndex::CheckVersion", message);
- }
-
- else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) {
- const string message = "unsupported format: this version of the index may not properly handle "
- "empty references. Please run 'bamtools index -bti -in yourData.bam to "
- "generate an up-to-date, fixed BTI file.";
+ "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
+ "to generate an up-to-date, fixed BTI file.";
throw BamException("BamToolsIndex::CheckVersion", message);
}
}
try {
// open new index file (read & write)
- string indexFilename = m_reader->Filename() + Extension();
+ const string indexFilename = m_reader->Filename() + Extension();
OpenFile(indexFilename, "w+b");
// initialize BtiFileSummary with number of references
else {
// store previous BTI block data in reference entry
- BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
refEntry.Blocks.push_back(block);
// write reference entry, then clear
++currentBlockCount;
// check end position
- int32_t alignmentEndPosition = al.GetEndPosition();
+ const int32_t alignmentEndPosition = al.GetEndPosition();
if ( alignmentEndPosition > blockMaxEndPosition )
blockMaxEndPosition = alignmentEndPosition;
if ( currentBlockCount == m_blockSize ) {
// store previous block data in reference entry
- BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
refEntry.Blocks.push_back(block);
// update markers
if ( blockRefId >= 0 ) {
// store last BTI block data in reference entry
- BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
refEntry.Blocks.push_back(block);
// write last reference entry, then clear
const BtiBlock& block = (*blockIter);
if ( block.StartPosition <= region.RightPosition ) {
- if ( block.MaxEndPosition >= region.LeftPosition ) {
+ if ( block.MaxEndPosition > region.LeftPosition ) {
offset = block.StartOffset;
break;
}
--blockIter;
const BtiBlock& previousBlock = (*blockIter);
- if ( previousBlock.MaxEndPosition < region.LeftPosition ) {
+ if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
offset = currentBlock.StartOffset;
found = true;
break;
// BamToolsIndex.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the BamTools index format (".bti")
// ***************************************************************************
// (might be useful later to handle any 'legacy' versions if the format changes)
// listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
//
- // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
+ // so a change introduced in BTI_1_2 may be handled from then on by:
//
// if ( indexVersion >= BTI_1_2 )
// do something new
enum Version { BTI_1_0 = 1
, BTI_1_1
, BTI_1_2
+ , BTI_2_0
};
// ctor & dtor
// BamWriter_p.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 7 October 2011 (DB)
+// Last modified: 8 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************
Close();
}
-// calculates minimum bin for a BAM alignment interval
+// calculates minimum bin for a BAM alignment interval [begin, end)
uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
--end;
if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
const unsigned int queryLength = al.QueryBases.size();
const unsigned int tagDataLength = al.TagData.size();
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
- // force calculation of Bin before storing
- const int endPosition = al.GetEndPosition();
- const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+ // no way to tell if alignment's bin is already defined (there is no default, invalid value)
+ // so we'll go ahead calculate its bin ID before storing
+ const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
// create our packed cigar string
string packedCigar;
// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 11 June 2011
+// Last modified: 8 October 2011
// ---------------------------------------------------------------------------
// Converts between BAM and a number of other formats
// ***************************************************************************
m_out << m_references.at(a.RefID).RefName << "\t"
<< a.Position << "\t"
- << a.GetEndPosition() + 1 << "\t"
+ << a.GetEndPosition() << "\t"
<< a.Name << "\t"
<< a.MapQuality << "\t"
<< (a.IsReverseStrand() ? "-" : "+") << endl;
CountTool::CountSettings* m_settings;
};
-static
-void printAlignments(const vector<BamAlignment>& alignments) {
-
- vector<BamAlignment>::const_iterator alIter = alignments.begin();
- vector<BamAlignment>::const_iterator alEnd = alignments.end();
- for ( ; alIter != alEnd; ++alIter ) {
- const BamAlignment& a = (*alIter);
-
- cerr << a.Name
- << "\t" << a.RefID << ":" << a.Position;
-
- int aqValue;
- bool hasTag = a.GetTag("Aq", aqValue);
- cerr << "\tAq=";
- if ( hasTag ) cerr << aqValue;
- else cerr << "?";
- cerr << endl;
- }
-}
-
bool CountTool::CountToolPrivate::Run(void) {
// if no '-in' args supplied, default to stdin
// if no region specified, count entire file
if ( !m_settings->HasRegion ) {
-
-
- vector<BamAlignment> alignments;
- while ( reader.GetNextAlignment(al) ) {
+ while ( reader.GetNextAlignmentCore(al) )
++alignmentCount;
-
- if ( alignments.size() < 100 )
- alignments.push_back(al);
- }
-
- using namespace BamTools::Algorithms;
-
- cerr << endl
- << "------------------------------" << endl
- << "Unsorted Alignments" << endl
- << "------------------------------" << endl
- << endl;
- std::stable_sort(alignments.begin(), alignments.end(), Sort::Unsorted());
- printAlignments(alignments);
- cerr << "------------------------------" << endl
- << endl;
-
- cerr << endl
- << "------------------------------" << endl
- << "Sorted Alignments (by name)" << endl
- << "------------------------------" << endl
- << endl;
-// std::sort(alignments.begin(), alignments.end(), Sort::ByName());
- Algorithms::SortAlignments(alignments, Sort::ByName());
- printAlignments(alignments);
- cerr << endl
- << "------------------------------" << endl
- << endl;
-
- cerr << endl
- << "------------------------------" << endl
- << "Sorted Alignments (by tag Aq)" << endl
- << "------------------------------" << endl
- << endl;
-// std::sort(alignments.begin(), alignments.end(), Sort::ByTag<int>("Aq"));
- Algorithms::SortAlignments(alignments, Sort::ByTag<int>("Aq"));
- printAlignments(alignments);
- cerr << endl
- << "------------------------------" << endl
- << endl;
-
- cerr << endl
- << "------------------------------" << endl
- << "Sorted Alignments (by tag Aq) desc" << endl
- << "------------------------------" << endl
- << endl;
- std::sort(alignments.begin(), alignments.end(), Sort::ByTag<int>("Aq", Sort::DescendingOrder));
- printAlignments(alignments);
- cerr << endl
- << "------------------------------" << endl
- << endl;
-
-
-
-
-// // ########################################
-// // original
-// // ########################################
-//
-// while ( reader.GetNextAlignmentCore(al) )
-// ++alignmentCount;
-//
-// //#########################################
}
// otherwise attempt to use region as constraint
// if index data available for all BAM files, we can use SetRegion
if ( reader.HasIndexes() ) {
- vector<BamAlignment> alignments;
- using namespace BamTools::Algorithms;
-
- alignments = GetSortedRegion(reader, region, Sort::ByName() );
- printAlignments(alignments);
-
- cerr << "################################" << endl;
-
- alignments = GetSortedRegion(reader, region, Sort::ByTag<int>("Aq"));
- printAlignments(alignments);
-
-// // attempt to set region on reader
-// if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
-// cerr << "bamtools count ERROR: set region failed. Check that REGION describes a valid range" << endl;
-// reader.Close();
-// return false;
-// }
+ // attempt to set region on reader
+ if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
+ cerr << "bamtools count ERROR: set region failed. Check that REGION describes a valid range" << endl;
+ reader.Close();
+ return false;
+ }
-// // everything checks out, just iterate through specified region, counting alignments
-// while ( reader.GetNextAlignmentCore(al) )
-// ++alignmentCount;
+ // everything checks out, just iterate through specified region, counting alignments
+ while ( reader.GetNextAlignmentCore(al) )
+ ++alignmentCount;
}
// no index data available, we have to iterate through until we
// bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 9 June 2011
+// Last modified: 8 October 2011
// ---------------------------------------------------------------------------
// Provides general utilities used by BamTools sub-tools.
// ***************************************************************************
startChrom = regionString;
startPos = 0;
stopChrom = regionString;
- stopPos = -1;
+ stopPos = 0;
}
// colon found, so we at least have some sort of startPos requested
// if startRefID not found, return false
int startRefID = reader.GetReferenceID(startChrom);
- if ( startRefID == (int)references.size() ) return false;
+ if ( startRefID == -1 ) return false;
- // if startPos is larger than reference, return false
+ // startPos cannot be greater than or equal to reference length
const RefData& startReference = references.at(startRefID);
- if ( startPos > startReference.RefLength ) return false;
+ if ( startPos >= startReference.RefLength ) return false;
// if stopRefID not found, return false
int stopRefID = reader.GetReferenceID(stopChrom);
- if ( stopRefID == (int)references.size() ) return false;
+ if ( stopRefID == -1 ) return false;
- // if stopPosition larger than reference, return false
+ // stopPosition cannot be larger than reference length
const RefData& stopReference = references.at(stopRefID);
if ( stopPos > stopReference.RefLength ) return false;
// -------------------------------
// set up Region struct & return
- region.LeftRefID = startRefID;
- region.LeftPosition = startPos;
- region.RightRefID = stopRefID;;
+ region.LeftRefID = startRefID;
+ region.LeftPosition = startPos;
+ region.RightRefID = stopRefID;;
region.RightPosition = stopPos;
return true;
}
// -------------------------------
// validate reference IDs & genomic positions
-
+
const RefVector references = reader.GetReferenceData();
-
+
// if startRefID not found, return false
int startRefID = reader.GetReferenceID(startChrom);
- if ( startRefID == (int)references.size() ) return false;
-
- // if startPos is larger than reference, return false
+ if ( startRefID == -1 ) return false;
+
+ // startPos cannot be greater than or equal to reference length
const RefData& startReference = references.at(startRefID);
- if ( startPos > startReference.RefLength ) return false;
-
+ if ( startPos >= startReference.RefLength ) return false;
+
// if stopRefID not found, return false
int stopRefID = reader.GetReferenceID(stopChrom);
- if ( stopRefID == (int)references.size() ) return false;
-
- // if stopPosition larger than reference, return false
+ if ( stopRefID == -1 ) return false;
+
+ // stopPosition cannot be larger than reference length
const RefData& stopReference = references.at(stopRefID);
if ( stopPos > stopReference.RefLength ) return false;
-
+
// if no stopPosition specified, set to reference end
- if ( stopPos == -1 ) stopPos = stopReference.RefLength;
-
+ if ( stopPos == -1 ) stopPos = stopReference.RefLength;
+
// -------------------------------
// set up Region struct & return
-
- region.LeftRefID = startRefID;
- region.LeftPosition = startPos;
- region.RightRefID = stopRefID;;
+
+ region.LeftRefID = startRefID;
+ region.LeftPosition = startPos;
+ region.RightRefID = stopRefID;;
region.RightPosition = stopPos;
return true;
}