1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments.
9 // ***************************************************************************
11 #include "bamtools_random.h"
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
29 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
31 // utility methods for RandomTool
32 int getRandomInt(const int& lowerBound, const int& upperBound) {
33 const int range = (upperBound - lowerBound) + 1;
34 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
37 } // namespace BamTools
39 // ---------------------------------------------
40 // RandomSettings implementation
42 struct RandomTool::RandomSettings {
45 bool HasAlignmentCount;
49 bool IsForceCompression;
52 unsigned int AlignmentCount;
53 vector<string> InputFiles;
54 string OutputFilename;
59 : HasAlignmentCount(false)
63 , IsForceCompression(false)
64 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
65 , OutputFilename(Options::StandardOut())
69 // ---------------------------------------------
70 // RandomTool implementation
72 RandomTool::RandomTool(void)
74 , m_settings(new RandomSettings)
76 // set program details
77 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
80 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
81 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
82 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
83 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
84 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
86 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
87 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
90 RandomTool::~RandomTool(void) {
95 int RandomTool::Help(void) {
96 Options::DisplayHelp();
100 int RandomTool::Run(int argc, char* argv[]) {
102 // parse command line arguments
103 Options::Parse(argc, argv, 1);
105 // set to default stdin if no input files provided
106 if ( !m_settings->HasInput )
107 m_settings->InputFiles.push_back(Options::StandardIn());
110 BamMultiReader reader;
111 if ( !reader.Open(m_settings->InputFiles) ) {
112 cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
116 // look up index files for all BAM files
117 reader.LocateIndexes();
119 // make sure index data is available
120 if ( !reader.HasIndexes() ) {
121 cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
126 // get BamReader metadata
127 const string headerText = reader.GetHeaderText();
128 const RefVector references = reader.GetReferenceData();
129 if ( references.empty() ) {
130 cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
135 // determine compression mode for BamWriter
136 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
137 !m_settings->IsForceCompression );
138 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
139 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
143 writer.SetCompressionMode(compressionMode);
144 if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
145 cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename << " for writing... Aborting." << endl;
150 // if user specified a REGION constraint, attempt to parse REGION string
152 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
153 cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
154 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
161 // seed our random number generator
164 // grab random alignments
167 while ( i < m_settings->AlignmentCount ) {
170 int randomPosition = 0;
172 // use REGION constraints to select random refId & position
173 if ( m_settings->HasRegion ) {
175 // select a random refId
176 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
178 // select a random position based on randomRefId
179 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
180 const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) );
181 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
184 // otherwise select from all possible random refId & position
187 // select random refId
188 randomRefId = getRandomInt(0, (int)references.size() - 1);
190 // select random position based on randomRefId
191 const int lowerBoundPosition = 0;
192 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
193 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
196 // if jump & read successful, save first alignment that overlaps random refId & position
197 if ( reader.Jump(randomRefId, randomPosition) ) {
198 while ( reader.GetNextAlignmentCore(al) ) {
199 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
200 writer.SaveAlignment(al);