1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 24 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Grab a random subset of alignments (testing tool)
8 // ***************************************************************************
10 #include "bamtools_random.h"
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_options.h>
15 #include <utils/bamtools_utilities.h>
16 using namespace BamTools;
29 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
31 // utility methods for RandomTool
32 int getRandomInt(const int& lowerBound, const int& upperBound) {
33 const int range = (upperBound - lowerBound) + 1;
34 return ( lowerBound + (int)(range * (double)rand()/((double)RAND_MAX + 1)) );
37 } // namespace BamTools
39 // ---------------------------------------------
40 // RandomSettings implementation
42 struct RandomTool::RandomSettings {
45 bool HasAlignmentCount;
47 bool HasInputFilelist;
49 bool HasRandomNumberSeed;
51 bool IsForceCompression;
54 unsigned int AlignmentCount;
55 vector<string> InputFiles;
57 string OutputFilename;
58 unsigned int RandomNumberSeed;
63 : HasAlignmentCount(false)
65 , HasInputFilelist(false)
67 , HasRandomNumberSeed(false)
69 , IsForceCompression(false)
70 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
71 , OutputFilename(Options::StandardOut())
76 // ---------------------------------------------
77 // RandomToolPrivate implementation
79 struct RandomTool::RandomToolPrivate {
83 RandomToolPrivate(RandomTool::RandomSettings* settings)
84 : m_settings(settings)
87 ~RandomToolPrivate(void) { }
95 RandomTool::RandomSettings* m_settings;
98 bool RandomTool::RandomToolPrivate::Run(void) {
100 // set to default stdin if no input files provided
101 if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
102 m_settings->InputFiles.push_back(Options::StandardIn());
104 // add files in the filelist to the input file list
105 if ( m_settings->HasInputFilelist ) {
107 ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
108 if ( !filelist.is_open() ) {
109 cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl;
114 while ( getline(filelist, line) )
115 m_settings->InputFiles.push_back(line);
119 BamMultiReader reader;
120 if ( !reader.Open(m_settings->InputFiles) ) {
121 cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
125 // look up index files for all BAM files
126 reader.LocateIndexes();
128 // make sure index data is available
129 if ( !reader.HasIndexes() ) {
130 cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
135 // get BamReader metadata
136 const string headerText = reader.GetHeaderText();
137 const RefVector references = reader.GetReferenceData();
138 if ( references.empty() ) {
139 cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
144 // determine compression mode for BamWriter
145 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
146 !m_settings->IsForceCompression );
147 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
148 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
152 writer.SetCompressionMode(compressionMode);
153 if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
154 cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
155 << " for writing... Aborting." << endl;
160 // if user specified a REGION constraint, attempt to parse REGION string
162 if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
163 cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
164 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
171 // seed our random number generator
172 if ( m_settings->HasRandomNumberSeed )
173 srand( m_settings->RandomNumberSeed );
177 // grab random alignments
180 while ( i < m_settings->AlignmentCount ) {
183 int randomPosition = 0;
185 // use REGION constraints to select random refId & position
186 if ( m_settings->HasRegion ) {
188 // select a random refId
189 randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
191 // select a random position based on randomRefId
192 const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
193 ? region.LeftPosition
195 const int upperBoundPosition = ( (randomRefId == region.RightRefID)
196 ? region.RightPosition
197 : (references.at(randomRefId).RefLength - 1) );
198 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
201 // otherwise select from all possible random refId & position
204 // select random refId
205 randomRefId = getRandomInt(0, (int)references.size() - 1);
207 // select random position based on randomRefId
208 const int lowerBoundPosition = 0;
209 const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
210 randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
213 // if jump & read successful, save first alignment that overlaps random refId & position
214 if ( reader.Jump(randomRefId, randomPosition) ) {
215 while ( reader.GetNextAlignmentCore(al) ) {
216 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
217 writer.SaveAlignment(al);
231 // ---------------------------------------------
232 // RandomTool implementation
234 RandomTool::RandomTool(void)
236 , m_settings(new RandomSettings)
239 // set program details
240 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments",
241 "[-in <filename> -in <filename> ... | -list <filelist>] [-out <filename>] [-forceCompression] [-n] [-region <REGION>]");
244 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
245 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
246 Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
247 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
248 Options::AddValueOption("-region", "REGION", "only pull random alignments from within this genomic region. Index file is recommended for better performance, and is used automatically if it exists. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
249 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
251 OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
252 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", "",
253 m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, RANDOM_MAX_ALIGNMENT_COUNT);
254 Options::AddValueOption("-seed", "unsigned integer", "random number generator seed (for repeatable results). Current time is used if no seed value is provided.", "",
255 m_settings->HasRandomNumberSeed, m_settings->RandomNumberSeed, SettingsOpts);
258 RandomTool::~RandomTool(void) {
267 int RandomTool::Help(void) {
268 Options::DisplayHelp();
272 int RandomTool::Run(int argc, char* argv[]) {
274 // parse command line arguments
275 Options::Parse(argc, argv, 1);
277 // initialize RandomTool with settings
278 m_impl = new RandomToolPrivate(m_settings);
280 // run RandomTool, return success/fail