1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 3 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments.
9 // ***************************************************************************
16 #include "bamtools_random.h"
17 #include "bamtools_options.h"
18 #include "bamtools_utilities.h"
19 #include "BamMultiReader.h"
20 #include "BamWriter.h"
22 using namespace BamTools;
27 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
29 // utility methods for RandomTool
30 const int getRandomInt(const int& lowerBound, const int& upperBound) {
31 const int range = (upperBound - lowerBound) + 1;
32 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
35 } // namespace BamTools
37 // ---------------------------------------------
38 // RandomSettings implementation
40 struct RandomTool::RandomSettings {
43 bool HasAlignmentCount;
47 bool IsForceCompression;
50 unsigned int AlignmentCount;
51 vector<string> InputFiles;
52 string OutputFilename;
57 : HasAlignmentCount(false)
61 , IsForceCompression(false)
62 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
63 , OutputFilename(Options::StandardOut())
67 // ---------------------------------------------
68 // RandomTool implementation
70 RandomTool::RandomTool(void)
72 , m_settings(new RandomSettings)
74 // set program details
75 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
78 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
79 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
80 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
81 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
83 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
84 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT);
85 Options::AddValueOption("-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
88 RandomTool::~RandomTool(void) {
93 int RandomTool::Help(void) {
94 Options::DisplayHelp();
98 int RandomTool::Run(int argc, char* argv[]) {
100 // parse command line arguments
101 Options::Parse(argc, argv, 1);
103 // set to default stdin if no input files provided
104 if ( !m_settings->HasInput )
105 m_settings->InputFiles.push_back(Options::StandardIn());
108 BamMultiReader reader;
109 if ( !reader.Open(m_settings->InputFiles) ) {
110 cerr << "ERROR: Could not open input BAM file(s)." << endl;
114 // make sure index data is available
115 if ( !reader.IsIndexLoaded() ) {
116 cerr << "ERROR: Could not load index data for all input BAM file(s)." << endl;
117 cerr << "\'bamtools random\' requires valid index files to provide efficient performance." << endl;
123 // get BamReader metadata
124 const string headerText = reader.GetHeaderText();
125 const RefVector references = reader.GetReferenceData();
126 if ( references.empty() ) {
127 cerr << "ERROR: No reference data available - required to perform random access throughtout input file(s)." << endl;
134 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
135 if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) {
136 cerr << "ERROR: Could not open BamWriter." << endl;
141 // if user specified a REGION constraint, attempt to parse REGION string
143 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
144 cerr << "ERROR: Could not parse REGION: " << m_settings->Region << endl;
145 cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
151 // seed our random number generator
154 // grab random alignments
157 while ( i < m_settings->AlignmentCount ) {
160 int randomPosition = 0;
162 // use REGION constraints to select random refId & position
163 if ( m_settings->HasRegion ) {
165 // select a random refId
166 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
168 // select a random position based on randomRefId
169 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
170 const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) );
171 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
174 // otherwise select from all possible random refId & position
177 // select random refId
178 randomRefId = getRandomInt(0, (int)references.size() - 1);
180 // select random position based on randomRefId
181 const int lowerBoundPosition = 0;
182 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
183 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
186 // if jump & read successful, save first alignment that overlaps random refId & position
187 if ( reader.Jump(randomRefId, randomPosition) ) {
188 while ( reader.GetNextAlignmentCore(al) ) {
189 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
190 writer.SaveAlignment(al);