1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments (testing tool)
9 // ***************************************************************************
11 #include "bamtools_random.h"
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
29 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
31 // utility methods for RandomTool
32 int getRandomInt(const int& lowerBound, const int& upperBound) {
33 const int range = (upperBound - lowerBound) + 1;
34 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
37 } // namespace BamTools
39 // ---------------------------------------------
40 // RandomSettings implementation
42 struct RandomTool::RandomSettings {
45 bool HasAlignmentCount;
49 bool IsForceCompression;
52 unsigned int AlignmentCount;
53 vector<string> InputFiles;
54 string OutputFilename;
59 : HasAlignmentCount(false)
63 , IsForceCompression(false)
64 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
65 , OutputFilename(Options::StandardOut())
69 // ---------------------------------------------
70 // RandomToolPrivate implementation
72 struct RandomTool::RandomToolPrivate {
76 RandomToolPrivate(RandomTool::RandomSettings* settings)
77 : m_settings(settings)
80 ~RandomToolPrivate(void) { }
88 RandomTool::RandomSettings* m_settings;
91 bool RandomTool::RandomToolPrivate::Run(void) {
93 // set to default stdin if no input files provided
94 if ( !m_settings->HasInput )
95 m_settings->InputFiles.push_back(Options::StandardIn());
98 BamMultiReader reader;
99 if ( !reader.Open(m_settings->InputFiles) ) {
100 cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
104 // look up index files for all BAM files
105 reader.LocateIndexes();
107 // make sure index data is available
108 if ( !reader.HasIndexes() ) {
109 cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
114 // get BamReader metadata
115 const string headerText = reader.GetHeaderText();
116 const RefVector references = reader.GetReferenceData();
117 if ( references.empty() ) {
118 cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
123 // determine compression mode for BamWriter
124 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
125 !m_settings->IsForceCompression );
126 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
127 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
131 writer.SetCompressionMode(compressionMode);
132 if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
133 cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
134 << " for writing... Aborting." << endl;
139 // if user specified a REGION constraint, attempt to parse REGION string
141 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
142 cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
143 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
150 // seed our random number generator
153 // grab random alignments
156 while ( i < m_settings->AlignmentCount ) {
159 int randomPosition = 0;
161 // use REGION constraints to select random refId & position
162 if ( m_settings->HasRegion ) {
164 // select a random refId
165 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
167 // select a random position based on randomRefId
168 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
169 ? region.LeftPosition
171 const int upperBoundPosition = ( (randomRefId == region.RightRefID)
172 ? region.RightPosition
173 : (references.at(randomRefId).RefLength - 1) );
174 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
177 // otherwise select from all possible random refId & position
180 // select random refId
181 randomRefId = getRandomInt(0, (int)references.size() - 1);
183 // select random position based on randomRefId
184 const int lowerBoundPosition = 0;
185 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
186 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
189 // if jump & read successful, save first alignment that overlaps random refId & position
190 if ( reader.Jump(randomRefId, randomPosition) ) {
191 while ( reader.GetNextAlignmentCore(al) ) {
192 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
193 writer.SaveAlignment(al);
207 // ---------------------------------------------
208 // RandomTool implementation
210 RandomTool::RandomTool(void)
212 , m_settings(new RandomSettings)
215 // set program details
216 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
219 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
220 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
221 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
222 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
223 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
225 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
226 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
229 RandomTool::~RandomTool(void) {
238 int RandomTool::Help(void) {
239 Options::DisplayHelp();
243 int RandomTool::Run(int argc, char* argv[]) {
245 // parse command line arguments
246 Options::Parse(argc, argv, 1);
248 // initialize RandomTool with settings
249 m_impl = new RandomToolPrivate(m_settings);
251 // run RandomTool, return success/fail