1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 7 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Grab a random subset of alignments (testing tool)
8 // ***************************************************************************
10 #include "bamtools_random.h"
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_options.h>
15 #include <utils/bamtools_utilities.h>
16 using namespace BamTools;
28 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
30 // utility methods for RandomTool
31 int getRandomInt(const int& lowerBound, const int& upperBound) {
32 const int range = (upperBound - lowerBound) + 1;
33 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
36 } // namespace BamTools
38 // ---------------------------------------------
39 // RandomSettings implementation
41 struct RandomTool::RandomSettings {
44 bool HasAlignmentCount;
48 bool IsForceCompression;
51 unsigned int AlignmentCount;
52 vector<string> InputFiles;
53 string OutputFilename;
58 : HasAlignmentCount(false)
62 , IsForceCompression(false)
63 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
64 , OutputFilename(Options::StandardOut())
68 // ---------------------------------------------
69 // RandomToolPrivate implementation
71 struct RandomTool::RandomToolPrivate {
75 RandomToolPrivate(RandomTool::RandomSettings* settings)
76 : m_settings(settings)
79 ~RandomToolPrivate(void) { }
87 RandomTool::RandomSettings* m_settings;
90 bool RandomTool::RandomToolPrivate::Run(void) {
92 // set to default stdin if no input files provided
93 if ( !m_settings->HasInput )
94 m_settings->InputFiles.push_back(Options::StandardIn());
97 BamMultiReader reader;
98 if ( !reader.Open(m_settings->InputFiles) ) {
99 cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
103 // look up index files for all BAM files
104 reader.LocateIndexes();
106 // make sure index data is available
107 if ( !reader.HasIndexes() ) {
108 cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
113 // get BamReader metadata
114 const string headerText = reader.GetHeaderText();
115 const RefVector references = reader.GetReferenceData();
116 if ( references.empty() ) {
117 cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
122 // determine compression mode for BamWriter
123 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
124 !m_settings->IsForceCompression );
125 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
126 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
130 writer.SetCompressionMode(compressionMode);
131 if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
132 cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
133 << " for writing... Aborting." << endl;
138 // if user specified a REGION constraint, attempt to parse REGION string
140 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
141 cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
142 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
149 // seed our random number generator
152 // grab random alignments
155 while ( i < m_settings->AlignmentCount ) {
158 int randomPosition = 0;
160 // use REGION constraints to select random refId & position
161 if ( m_settings->HasRegion ) {
163 // select a random refId
164 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
166 // select a random position based on randomRefId
167 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
168 ? region.LeftPosition
170 const int upperBoundPosition = ( (randomRefId == region.RightRefID)
171 ? region.RightPosition
172 : (references.at(randomRefId).RefLength - 1) );
173 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
176 // otherwise select from all possible random refId & position
179 // select random refId
180 randomRefId = getRandomInt(0, (int)references.size() - 1);
182 // select random position based on randomRefId
183 const int lowerBoundPosition = 0;
184 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
185 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
188 // if jump & read successful, save first alignment that overlaps random refId & position
189 if ( reader.Jump(randomRefId, randomPosition) ) {
190 while ( reader.GetNextAlignmentCore(al) ) {
191 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
192 writer.SaveAlignment(al);
206 // ---------------------------------------------
207 // RandomTool implementation
209 RandomTool::RandomTool(void)
211 , m_settings(new RandomSettings)
214 // set program details
215 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
218 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
219 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
220 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
221 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
222 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
224 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
225 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
228 RandomTool::~RandomTool(void) {
237 int RandomTool::Help(void) {
238 Options::DisplayHelp();
242 int RandomTool::Run(int argc, char* argv[]) {
244 // parse command line arguments
245 Options::Parse(argc, argv, 1);
247 // initialize RandomTool with settings
248 m_impl = new RandomToolPrivate(m_settings);
250 // run RandomTool, return success/fail