1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 20 July 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments.
9 // ***************************************************************************
16 #include "bamtools_random.h"
17 #include "bamtools_options.h"
18 #include "bamtools_utilities.h"
19 #include "BamMultiReader.h"
20 #include "BamWriter.h"
22 using namespace BamTools;
27 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
29 } // namespace BamTools
31 // ---------------------------------------------
32 // RandomSettings implementation
34 struct RandomTool::RandomSettings {
37 bool HasAlignmentCount;
41 bool IsForceCompression;
44 unsigned int AlignmentCount;
45 vector<string> InputFiles;
46 string OutputFilename;
51 : HasAlignmentCount(false)
55 , IsForceCompression(false)
56 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
57 , OutputFilename(Options::StandardOut())
61 // ---------------------------------------------
62 // RandomTool implementation
64 RandomTool::RandomTool(void)
66 , m_settings(new RandomSettings)
68 // set program details
69 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-region <REGION>]");
72 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
73 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
74 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
75 Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
77 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
78 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed (currently)", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT);
79 Options::AddValueOption("-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
82 RandomTool::~RandomTool(void) {
87 int RandomTool::Help(void) {
88 Options::DisplayHelp();
92 int RandomTool::Run(int argc, char* argv[]) {
94 // TODO: Handle BAM input WITHOUT index files.
96 // parse command line arguments
97 Options::Parse(argc, argv, 1);
99 // set to default input if none provided
100 if ( !m_settings->HasInput )
101 m_settings->InputFiles.push_back(Options::StandardIn());
103 // open our BAM reader
104 BamMultiReader reader;
105 reader.Open(m_settings->InputFiles);
106 string headerText = reader.GetHeaderText();
107 RefVector references = reader.GetReferenceData();
109 // check that reference data is available, used for generating random jumps
110 if ( references.empty() ) {
111 cerr << "No reference data available... quitting." << endl;
116 // see if user specified a REGION
118 if ( m_settings->HasRegion ) {
119 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) )
120 reader.SetRegion(region);
125 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
126 writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed);
128 // seed our random number generator
131 // grab random alignments
134 while ( i < m_settings->AlignmentCount ) {
137 int randomPosition = 0;
139 // use REGION constraints to generate random refId & position
140 if ( m_settings->HasRegion ) {
142 int lowestRefId = region.LeftRefID;
143 int highestRefId = region.RightRefID;
144 int rangeRefId = (highestRefId - lowestRefId) + 1;
145 randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
147 int lowestPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
148 int highestPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : references.at(randomRefId).RefLength - 1 );
149 int rangePosition = (highestPosition - lowestPosition) + 1;
150 randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
153 // otherwise generate 'normal' random refId & position
156 // generate random refId
158 int highestRefId = references.size() - 1;
159 int rangeRefId = (highestRefId - lowestRefId) + 1;
160 randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
162 // generate random position
163 int lowestPosition = 0;
164 int highestPosition = references.at(randomRefId).RefLength - 1;
165 int rangePosition = (highestPosition - lowestPosition) + 1;
166 randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
169 // if jump & read successful, save alignment
170 if ( reader.Jump(randomRefId, randomPosition) ) {
171 while ( reader.GetNextAlignmentCore(al) ) {
172 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
173 writer.SaveAlignment(al);
181 // close reader & writer