1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 3 September 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments.
9 // ***************************************************************************
16 #include "bamtools_random.h"
17 #include "bamtools_options.h"
18 #include "bamtools_utilities.h"
19 #include "BamMultiReader.h"
20 #include "BamWriter.h"
22 using namespace BamTools;
27 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
29 // utility methods for RandomTool
30 const int getRandomInt(const int& lowerBound, const int& upperBound) {
31 const int range = (upperBound - lowerBound) + 1;
32 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
35 } // namespace BamTools
37 // ---------------------------------------------
38 // RandomSettings implementation
40 struct RandomTool::RandomSettings {
43 bool HasAlignmentCount;
47 bool IsForceCompression;
50 unsigned int AlignmentCount;
51 vector<string> InputFiles;
52 string OutputFilename;
57 : HasAlignmentCount(false)
61 , IsForceCompression(false)
62 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
63 , OutputFilename(Options::StandardOut())
67 // ---------------------------------------------
68 // RandomTool implementation
70 RandomTool::RandomTool(void)
72 , m_settings(new RandomSettings)
74 // set program details
75 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
78 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
79 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
80 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
81 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
82 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
84 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
85 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
88 RandomTool::~RandomTool(void) {
93 int RandomTool::Help(void) {
94 Options::DisplayHelp();
98 int RandomTool::Run(int argc, char* argv[]) {
100 // parse command line arguments
101 Options::Parse(argc, argv, 1);
103 // set to default stdin if no input files provided
104 if ( !m_settings->HasInput )
105 m_settings->InputFiles.push_back(Options::StandardIn());
108 BamMultiReader reader;
109 if ( !reader.Open(m_settings->InputFiles) ) {
110 cerr << "ERROR: Could not open input BAM file(s)." << endl;
114 // make sure index data is available
115 if ( !reader.IsIndexLoaded() ) {
116 cerr << "ERROR: Could not load index data for all input BAM file(s)." << endl;
117 cerr << "\'bamtools random\' requires valid index files to provide efficient performance." << endl;
123 // get BamReader metadata
124 const string headerText = reader.GetHeaderText();
125 const RefVector references = reader.GetReferenceData();
126 if ( references.empty() ) {
127 cerr << "ERROR: No reference data available - required to perform random access throughtout input file(s)." << endl;
134 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
135 if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) {
136 cerr << "ERROR: Could not open BamWriter." << endl;
141 // if user specified a REGION constraint, attempt to parse REGION string
143 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
144 cerr << "ERROR: Could not parse REGION: " << m_settings->Region << endl;
145 cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
151 // seed our random number generator
154 // grab random alignments
157 while ( i < m_settings->AlignmentCount ) {
160 int randomPosition = 0;
162 // use REGION constraints to select random refId & position
163 if ( m_settings->HasRegion ) {
165 // select a random refId
166 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
168 // select a random position based on randomRefId
169 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
170 const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) );
171 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
174 // otherwise select from all possible random refId & position
177 // select random refId
178 randomRefId = getRandomInt(0, (int)references.size() - 1);
180 // select random position based on randomRefId
181 const int lowerBoundPosition = 0;
182 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
183 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
186 // if jump & read successful, save first alignment that overlaps random refId & position
187 if ( reader.Jump(randomRefId, randomPosition) ) {
188 while ( reader.GetNextAlignmentCore(al) ) {
189 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
190 writer.SaveAlignment(al);