1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 December 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Grab a random subset of alignments (testing tool)
8 // ***************************************************************************
10 #include "bamtools_random.h"
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_options.h>
15 #include <utils/bamtools_utilities.h>
16 using namespace BamTools;
29 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
31 // utility methods for RandomTool
32 int getRandomInt(const int& lowerBound, const int& upperBound) {
33 const int range = (upperBound - lowerBound) + 1;
34 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
37 } // namespace BamTools
39 // ---------------------------------------------
40 // RandomSettings implementation
42 struct RandomTool::RandomSettings {
45 bool HasAlignmentCount;
47 bool HasInputFilelist;
50 bool IsForceCompression;
53 unsigned int AlignmentCount;
54 vector<string> InputFiles;
56 string OutputFilename;
61 : HasAlignmentCount(false)
63 , HasInputFilelist(false)
66 , IsForceCompression(false)
67 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
68 , OutputFilename(Options::StandardOut())
72 // ---------------------------------------------
73 // RandomToolPrivate implementation
75 struct RandomTool::RandomToolPrivate {
79 RandomToolPrivate(RandomTool::RandomSettings* settings)
80 : m_settings(settings)
83 ~RandomToolPrivate(void) { }
91 RandomTool::RandomSettings* m_settings;
94 bool RandomTool::RandomToolPrivate::Run(void) {
96 // set to default stdin if no input files provided
97 if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
98 m_settings->InputFiles.push_back(Options::StandardIn());
100 // add files in the filelist to the input file list
101 if ( m_settings->HasInputFilelist ) {
103 ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
104 if ( !filelist.is_open() ) {
105 cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl;
110 while ( getline(filelist, line) )
111 m_settings->InputFiles.push_back(line);
115 BamMultiReader reader;
116 if ( !reader.Open(m_settings->InputFiles) ) {
117 cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
121 // look up index files for all BAM files
122 reader.LocateIndexes();
124 // make sure index data is available
125 if ( !reader.HasIndexes() ) {
126 cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
131 // get BamReader metadata
132 const string headerText = reader.GetHeaderText();
133 const RefVector references = reader.GetReferenceData();
134 if ( references.empty() ) {
135 cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
140 // determine compression mode for BamWriter
141 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
142 !m_settings->IsForceCompression );
143 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
144 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
148 writer.SetCompressionMode(compressionMode);
149 if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
150 cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
151 << " for writing... Aborting." << endl;
156 // if user specified a REGION constraint, attempt to parse REGION string
158 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
159 cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
160 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
167 // seed our random number generator
170 // grab random alignments
173 while ( i < m_settings->AlignmentCount ) {
176 int randomPosition = 0;
178 // use REGION constraints to select random refId & position
179 if ( m_settings->HasRegion ) {
181 // select a random refId
182 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
184 // select a random position based on randomRefId
185 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
186 ? region.LeftPosition
188 const int upperBoundPosition = ( (randomRefId == region.RightRefID)
189 ? region.RightPosition
190 : (references.at(randomRefId).RefLength - 1) );
191 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
194 // otherwise select from all possible random refId & position
197 // select random refId
198 randomRefId = getRandomInt(0, (int)references.size() - 1);
200 // select random position based on randomRefId
201 const int lowerBoundPosition = 0;
202 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
203 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
206 // if jump & read successful, save first alignment that overlaps random refId & position
207 if ( reader.Jump(randomRefId, randomPosition) ) {
208 while ( reader.GetNextAlignmentCore(al) ) {
209 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
210 writer.SaveAlignment(al);
224 // ---------------------------------------------
225 // RandomTool implementation
227 RandomTool::RandomTool(void)
229 , m_settings(new RandomSettings)
232 // set program details
233 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments",
234 "[-in <filename> -in <filename> ... | -list <filelist>] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
237 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
238 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
239 Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
240 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
241 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
242 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
244 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
245 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
248 RandomTool::~RandomTool(void) {
257 int RandomTool::Help(void) {
258 Options::DisplayHelp();
262 int RandomTool::Run(int argc, char* argv[]) {
264 // parse command line arguments
265 Options::Parse(argc, argv, 1);
267 // initialize RandomTool with settings
268 m_impl = new RandomToolPrivate(m_settings);
270 // run RandomTool, return success/fail