1 // ***************************************************************************
2 // bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 20 July 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Grab a random subset of alignments.
9 // ***************************************************************************
16 #include "bamtools_random.h"
17 #include "bamtools_options.h"
18 #include "bamtools_utilities.h"
19 #include "BamMultiReader.h"
20 #include "BamWriter.h"
22 using namespace BamTools;
27 const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
29 } // namespace BamTools
31 // ---------------------------------------------
32 // RandomSettings implementation
34 struct RandomTool::RandomSettings {
37 bool HasAlignmentCount;
43 unsigned int AlignmentCount;
44 vector<string> InputFiles;
45 string OutputFilename;
50 : HasAlignmentCount(false)
54 , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
58 // ---------------------------------------------
59 // RandomTool implementation
61 RandomTool::RandomTool(void)
63 , m_settings(new RandomSettings)
65 // set program details
66 Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-region <REGION>]");
69 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
70 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
71 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
73 OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
74 Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed (currently)", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT);
75 Options::AddValueOption("-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
78 RandomTool::~RandomTool(void) {
83 int RandomTool::Help(void) {
84 Options::DisplayHelp();
88 int RandomTool::Run(int argc, char* argv[]) {
90 // TODO: Handle BAM input WITHOUT index files.
92 // parse command line arguments
93 Options::Parse(argc, argv, 1);
95 // set to default input if none provided
96 if ( !m_settings->HasInput )
97 m_settings->InputFiles.push_back(Options::StandardIn());
99 // open our BAM reader
100 BamMultiReader reader;
101 reader.Open(m_settings->InputFiles);
102 string headerText = reader.GetHeaderText();
103 RefVector references = reader.GetReferenceData();
105 // check that reference data is available, used for generating random jumps
106 if ( references.empty() ) {
107 cerr << "No reference data available... quitting." << endl;
112 // see if user specified a REGION
114 if ( m_settings->HasRegion ) {
115 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) )
116 reader.SetRegion(region);
119 // open out BAM writer
121 writer.Open(m_settings->OutputFilename, headerText, references);
123 // seed our random number generator
126 // grab random alignments
129 while ( i < m_settings->AlignmentCount ) {
132 int randomPosition = 0;
134 // use REGION constraints to generate random refId & position
135 if ( m_settings->HasRegion ) {
137 int lowestRefId = region.LeftRefID;
138 int highestRefId = region.RightRefID;
139 int rangeRefId = (highestRefId - lowestRefId) + 1;
140 randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
142 int lowestPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
143 int highestPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : references.at(randomRefId).RefLength - 1 );
144 int rangePosition = (highestPosition - lowestPosition) + 1;
145 randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
148 // otherwise generate 'normal' random refId & position
151 // generate random refId
153 int highestRefId = references.size() - 1;
154 int rangeRefId = (highestRefId - lowestRefId) + 1;
155 randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
157 // generate random position
158 int lowestPosition = 0;
159 int highestPosition = references.at(randomRefId).RefLength - 1;
160 int rangePosition = (highestPosition - lowestPosition) + 1;
161 randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
164 // if jump & read successful, save alignment
165 if ( reader.Jump(randomRefId, randomPosition) ) {
166 while ( reader.GetNextAlignmentCore(al) ) {
167 if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
168 writer.SaveAlignment(al);
176 // close reader & writer