]> git.donarmstrong.com Git - bamtools.git/blob - src/utils/bamtools_pileup_engine.cpp
PileupEngine performance improvement for high coverage
[bamtools.git] / src / utils / bamtools_pileup_engine.cpp
1 // ***************************************************************************
2 // bamtools_pileup_engine.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 9 March 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides pileup at position functionality for various tools.
8 // ***************************************************************************
9
10 #include "utils/bamtools_pileup_engine.h"
11 using namespace BamTools;
12
13 #include <iostream>
14 using namespace std;
15
16 // ---------------------------------------------
17 // PileupEnginePrivate implementation
18
19 struct PileupEngine::PileupEnginePrivate {
20   
21     // data members
22     int CurrentId;
23     int CurrentPosition;
24     vector<BamAlignment> CurrentAlignments;
25     PileupPosition CurrentPileupData;
26     
27     bool IsFirstAlignment;
28     vector<PileupVisitor*> Visitors;
29   
30     // ctor & dtor
31     PileupEnginePrivate(void)
32         : CurrentId(-1)
33         , CurrentPosition(-1)
34         , IsFirstAlignment(true)
35     { }
36     ~PileupEnginePrivate(void) { }
37     
38     // 'public' methods
39     bool AddAlignment(const BamAlignment& al);
40     void Flush(void);
41     
42     // internal methods
43     private:
44         void ApplyVisitors(void);
45         void ClearOldData(void);
46         void CreatePileupData(void);
47         void ParseAlignmentCigar(const BamAlignment& al);
48 };
49
50 bool PileupEngine::PileupEnginePrivate::AddAlignment(const BamAlignment& al) {
51   
52     // if first time
53     if ( IsFirstAlignment ) {
54       
55         // set initial markers 
56         CurrentId       = al.RefID;
57         CurrentPosition = al.Position;
58         
59         // store first entry
60         CurrentAlignments.clear();
61         CurrentAlignments.push_back(al);
62         
63         // set flag & return
64         IsFirstAlignment = false;
65         return true;
66     }
67   
68     // if same reference
69     if ( al.RefID == CurrentId ) {
70       
71         // if same position, store and move on
72         if ( al.Position == CurrentPosition )
73             CurrentAlignments.push_back(al);
74         
75         // if less than CurrentPosition - sorting error => ABORT
76         else if ( al.Position < CurrentPosition ) {
77             cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
78             return false;
79         }
80         
81         // else print pileup data until 'catching up' to CurrentPosition
82         else {
83             while ( al.Position > CurrentPosition ) {
84                 ApplyVisitors();
85                 ++CurrentPosition;
86             }
87             CurrentAlignments.push_back(al);
88         }
89     } 
90
91     // if reference ID less than CurrentId - sorting error => ABORT
92     else if ( al.RefID < CurrentId ) {
93         cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
94         return false;
95     }
96
97     // else moved forward onto next reference
98     else {
99         
100         // print any remaining pileup data from previous reference
101         while ( !CurrentAlignments.empty() ) {
102             ApplyVisitors();
103             ++CurrentPosition;
104         }
105         
106         // store first entry on this new reference, update markers
107         CurrentAlignments.clear();
108         CurrentAlignments.push_back(al);
109         CurrentId = al.RefID;
110         CurrentPosition = al.Position;
111     }
112   
113     return true;
114 }
115
116 void PileupEngine::PileupEnginePrivate::ApplyVisitors(void) {
117   
118     // parse CIGAR data in BamAlignments to build up current pileup data
119     CreatePileupData();
120   
121     // apply all visitors to current alignment set
122     vector<PileupVisitor*>::const_iterator visitorIter = Visitors.begin();
123     vector<PileupVisitor*>::const_iterator visitorEnd  = Visitors.end();
124     for ( ; visitorIter != visitorEnd; ++visitorIter ) 
125         (*visitorIter)->Visit(CurrentPileupData);
126 }
127
128 void PileupEngine::PileupEnginePrivate::ClearOldData(void) {
129  
130     // remove any alignments that end before our CurrentPosition
131     // N.B. - BAM positions are 0-based, half-open. GetEndPosition() returns a 1-based position,
132     //        while our CurrentPosition is 0-based. For example, an alignment with 'endPosition' of
133     //        100 does not overlap a 'CurrentPosition' of 100, and should be discarded.
134
135     size_t i = 0;
136     size_t j = 0;
137     const size_t numAlignments = CurrentAlignments.size();
138     while ( i < numAlignments ) {
139
140         // skip over alignment if its (1-based) endPosition is <= to (0-based) CurrentPosition
141         // i.e. this entry will not be saved upon vector resize
142         const int endPosition = CurrentAlignments[i].GetEndPosition();
143         if ( endPosition <= CurrentPosition ) {
144             ++i;
145             continue;
146         }
147
148         // otherwise alignment ends after CurrentPosition
149         // move it towards vector beginning, at index j
150         if ( i != j )
151             CurrentAlignments[j] = CurrentAlignments[i];
152
153         // increment our indices
154         ++i;
155         ++j;
156     }
157
158     // 'squeeze' vector to size j, discarding all remaining alignments in the container
159     CurrentAlignments.resize(j);
160 }
161
162 void PileupEngine::PileupEnginePrivate::CreatePileupData(void) {
163   
164     // remove any non-overlapping alignments
165     ClearOldData();
166   
167     // set pileup refId, position to current markers
168     CurrentPileupData.RefId = CurrentId;
169     CurrentPileupData.Position = CurrentPosition;
170     CurrentPileupData.PileupAlignments.clear();
171     
172     // parse CIGAR data in remaining alignments 
173     vector<BamAlignment>::const_iterator alIter = CurrentAlignments.begin();
174     vector<BamAlignment>::const_iterator alEnd  = CurrentAlignments.end(); 
175     for ( ; alIter != alEnd; ++alIter )
176         ParseAlignmentCigar( (*alIter) );
177 }
178
179 void PileupEngine::PileupEnginePrivate::Flush(void) {
180     while ( !CurrentAlignments.empty() ) {
181         ApplyVisitors();
182         ++CurrentPosition;
183     }
184 }
185
186 void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) {
187   
188     // skip if unmapped
189     if ( !al.IsMapped() ) return;
190     
191     // intialize local variables
192     int  genomePosition      = al.Position;
193     int  positionInAlignment = 0;
194     bool isNewReadSegment    = true;
195     bool saveAlignment       = true;    
196     PileupAlignment pileupAlignment(al);
197     
198     // iterate over CIGAR operations
199     const int numCigarOps = (const int)al.CigarData.size();
200     for (int i = 0; i < numCigarOps; ++i ) { 
201         const CigarOp& op = al.CigarData.at(i);
202       
203         // if op is MATCH
204         if ( op.Type == 'M' ) {
205           
206             // if match op overlaps current position
207             if ( genomePosition + (int)op.Length > CurrentPosition ) {
208               
209                 // set pileup data
210                 pileupAlignment.IsCurrentDeletion   = false;
211                 pileupAlignment.IsNextDeletion      = false;
212                 pileupAlignment.IsNextInsertion     = false;
213                 pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
214                 
215                 // check for beginning of read segment
216                 if ( genomePosition == CurrentPosition && isNewReadSegment ) 
217                     pileupAlignment.IsSegmentBegin = true;
218                 
219                 // if we're at the end of a match operation
220                 if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) {
221                     
222                     // if not last operation
223                     if ( i < numCigarOps - 1 ) {
224                         
225                         // check next CIGAR op
226                         const CigarOp& nextOp = al.CigarData.at(i+1);
227                         
228                         // if next CIGAR op is DELETION
229                         if ( nextOp.Type == 'D') {
230                             pileupAlignment.IsNextDeletion = true;
231                             pileupAlignment.DeletionLength = nextOp.Length;
232                         }
233                         
234                         // if next CIGAR op is INSERTION
235                         else if ( nextOp.Type == 'I' ) {
236                             pileupAlignment.IsNextInsertion = true;
237                             pileupAlignment.InsertionLength = nextOp.Length;
238                         }
239                             
240                         // if next CIGAR op is either DELETION or INSERTION
241                         if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) {
242
243                             // if there is a CIGAR op after the DEL/INS
244                             if ( i < numCigarOps - 2 ) {
245                                 const CigarOp& nextNextOp = al.CigarData.at(i+2);
246                                 
247                                 // if next CIGAR op is clipping or ref_skip
248                                 if ( nextNextOp.Type == 'S' || 
249                                      nextNextOp.Type == 'N' ||
250                                      nextNextOp.Type == 'H' )
251                                     pileupAlignment.IsSegmentEnd = true;
252                             } 
253                             else {
254                                 pileupAlignment.IsSegmentEnd = true;
255                                 
256                                 // if next CIGAR op is clipping or ref_skip
257                                 if ( nextOp.Type == 'S' || 
258                                      nextOp.Type == 'N' ||
259                                      nextOp.Type == 'H' )
260                                     pileupAlignment.IsSegmentEnd = true;
261                             }
262                         }
263                         
264                         // otherwise
265                         else { 
266                         
267                             // if next CIGAR op is clipping or ref_skip
268                             if ( nextOp.Type == 'S' || 
269                                  nextOp.Type == 'N' ||
270                                  nextOp.Type == 'H' )
271                                 pileupAlignment.IsSegmentEnd = true;
272                         }
273                     }
274                     
275                     // else this is last operation
276                     else pileupAlignment.IsSegmentEnd = true;
277                 }
278             }
279           
280             // increment markers
281             genomePosition      += op.Length;
282             positionInAlignment += op.Length;
283         } 
284         
285         // if op is DELETION
286         else if ( op.Type == 'D' ) {
287           
288             // if deletion op overlaps current position
289             if ( genomePosition + (int)op.Length > CurrentPosition ) {
290               
291                 // set pileup data
292                 pileupAlignment.IsCurrentDeletion   = true;
293                 pileupAlignment.IsNextDeletion      = false;
294                 pileupAlignment.IsNextInsertion     = true;
295                 pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
296             }
297             
298             // increment marker
299             genomePosition += op.Length;
300         }
301
302         // if op is REF_SKIP
303         else if ( op.Type == 'N' ) {
304             genomePosition += op.Length;
305         }
306         
307         // if op is INSERTION or SOFT_CLIP
308         else if ( op.Type == 'I' || op.Type == 'S' ) {
309             positionInAlignment += op.Length;
310         }
311         
312         // checl for beginning of new read segment
313         if ( op.Type == 'N' ||
314              op.Type == 'S' ||
315              op.Type == 'H' )
316             isNewReadSegment = true;
317         else 
318             isNewReadSegment = false;
319       
320         // if we've moved beyond current position
321         if ( genomePosition > CurrentPosition ) {
322             if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP
323             break;
324         }
325     }
326
327     // save pileup position if flag is true
328     if ( saveAlignment )
329         CurrentPileupData.PileupAlignments.push_back( pileupAlignment );
330 }
331
332 // ---------------------------------------------
333 // PileupEngine implementation
334
335 PileupEngine::PileupEngine(void)
336     : d( new PileupEnginePrivate )
337 { }
338
339 PileupEngine::~PileupEngine(void) {
340     delete d;
341     d = 0;
342 }
343
344 bool PileupEngine::AddAlignment(const BamAlignment& al) { return d->AddAlignment(al); }
345 void PileupEngine::AddVisitor(PileupVisitor* visitor) { d->Visitors.push_back(visitor); }
346 void PileupEngine::Flush(void) { d->Flush(); }