]> git.donarmstrong.com Git - mothur.git/blob - suffixtree.cpp
fd18109513bea9f59590e16a1c705128f61ca43e
[mothur.git] / suffixtree.cpp
1 /*
2  *  suffixtree.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my half-assed attempt to implement a suffix tree.  This is a cobbled together algorithm using materials that
9  *      I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
10  *
11  *              Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
12  *              Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. 
13  *                      USA: Cambridge University Press
14  *
15  *      The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
16  *
17  *      I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects.  The root is stored at
18  *      nodeVector[0].  Each tree also stores the sequence name and the string that corresponds to the actual sequence. 
19  *      Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
20  *      sequence (countSuffixes).  This method is used to determine similarity between sequences and was inspired by the
21  *      article and Perl source code provided at http://www.ddj.com/web-development/184416093.
22  *
23  */
24
25 #include "sequence.hpp"
26 #include "suffixnodes.hpp"
27 #include "suffixtree.hpp"
28
29
30 //********************************************************************************************************************
31
32 inline bool compareParents(SuffixNode* left, SuffixNode* right){//      this is necessary to print the tree and to sort the
33         return (left->getParentNode() < right->getParentNode());        //      nodes in order of their parent
34 }
35
36 //********************************************************************************************************************
37  
38 SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode),
39                                                                                                 nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) { 
40         try {
41                 m = MothurOut::getInstance(); 
42                 
43                 for (int i = 0; i < st.nodeVector.size(); i++) {
44                         SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i]));
45                         nodeVector.push_back(temp);
46                 }
47                 
48                 
49         }catch(exception& e) {
50                 m->errorOut(e, "SuffixTree", "SuffixTree");
51                 exit(1);
52         }
53 }
54  
55 //********************************************************************************************************************
56
57 SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); }
58
59 //********************************************************************************************************************
60
61 SuffixTree::~SuffixTree(){
62         for(int i=0;i<nodeVector.size();i++){   delete nodeVector[i];   }       
63         nodeVector.clear();
64 }
65
66 //********************************************************************************************************************
67
68 void SuffixTree::loadSequence(Sequence seq){
69         nodeCounter = 0;                                                        //      initially there are 0 nodes in the tree
70         activeStartPosition = 0;
71         activeEndPosition = -1;                                         
72         seqName = seq.getName();
73         sequence = seq.convert2ints();
74         sequence += '5';                                                        //      this essentially concatenates a '$' to the end of the sequence to
75         int seqLength = sequence.length();                      //      make it a cononical suffix tree
76         
77         nodeVector.push_back(new SuffixBranch(-1, 0, -1));      //      enter the root of the suffix tree
78         
79         activeNode = root = 0;
80         string hold;
81         for(int i=0;i<seqLength;i++){
82                 addPrefix(i);                                                   //      step through the sequence adding each prefix
83         }
84 }
85
86 //********************************************************************************************************************
87
88 string SuffixTree::getSeqName() {
89         return seqName;         
90 }
91
92 //********************************************************************************************************************
93
94 void SuffixTree::print(){
95         vector<SuffixNode*> hold = nodeVector;
96         sort(hold.begin(), hold.end(), compareParents);
97         m->mothurOut("Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix"); m->mothurOutEndLine();
98         for(int i=1;i<=nodeCounter;i++){
99                 hold[i]->print(sequence, i);
100         }
101 }
102
103 //********************************************************************************************************************
104
105 int SuffixTree::countSuffixes(string compareSequence, int& minValue){   //      here we count the number of suffix parts 
106                                                                                                                         //      we need to rewrite a user supplied sequence.  if the 
107         int numSuffixes = 0;                                                                    //      count exceeds the supplied minValue, bail out.  The
108         int seqLength = compareSequence.length();                               //      time complexity should be O(L)
109         int position = 0;
110         
111         int presentNode = 0;
112         
113         while(position < seqLength){            //      while the position in the query sequence isn't at the end...
114                 
115                 if(numSuffixes > minValue)      {       return 1000000;         }       //      bail if the count gets too high
116                 
117                 int newNode = nodeVector[presentNode]->getChild(compareSequence[position]);     //      see if the current node has a
118                                                                                                                                 //      child that matches the next character in the query
119                 if(newNode == -1){                                                                              
120                         if(presentNode == 0){   position++;             }                       //      if not, go back to the root and increase the count
121                         numSuffixes++;                                                                          //      by one.
122                         presentNode = 0;
123                 }
124                 else{                                                                                                   //      if there is, move to that node and see how far down
125                         presentNode = newNode;                                                          //      it we can get
126                         
127                         for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
128                                 if(compareSequence[position] == sequence[i]){
129                                         position++;                                                                     //      as long as the query and branch agree, keep going
130                                 }
131                                 else{
132                                         numSuffixes++;                                                          //      if there is a mismatch, increase the number of 
133                                         presentNode = 0;                                                        //      suffixes and go back to the root
134                                         break;
135                                 }
136                         }
137                 }
138                 //      if we get all the way through the node we'll go to the top of the while loop and find the child node
139                 //      that corresponds to what we are interested in           
140         }
141         numSuffixes--;                                                                                          //      the method puts an extra count on numSuffixes
142         
143         if(numSuffixes < minValue)      {       minValue = numSuffixes; }       //      if the count is less than the previous minValue,
144         return numSuffixes;                                                                                     //      change the value and return the number of suffixes
145         
146 }
147 //********************************************************************************************************************
148
149 int SuffixTree::countSuffixes(string compareSequence){  //      here we count the number of suffix parts 
150                                                                                                                         //      we need to rewrite a user supplied sequence.  if the 
151         int numSuffixes = 0;                                                                    //      count exceeds the supplied minValue, bail out.  The
152         int seqLength = compareSequence.length();                               //      time complexity should be O(L)
153         int position = 0;
154         
155         int presentNode = 0;
156         
157         while(position < seqLength){            //      while the position in the query sequence isn't at the end...
158                 
159                 int newNode = nodeVector[presentNode]->getChild(compareSequence[position]);     //      see if the current node has a
160                                                                                                                                 //      child that matches the next character in the query
161                 if(newNode == -1){                                                                              
162                         if(presentNode == 0){   position++;             }                       //      if not, go back to the root and increase the count
163                         numSuffixes++;                                                                          //      by one.
164                         presentNode = 0;
165                 }
166                 else{                                                                                                   //      if there is, move to that node and see how far down
167                         presentNode = newNode;                                                          //      it we can get
168                         
169                         for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
170                                 if(compareSequence[position] == sequence[i]){
171                                         position++;                                                                     //      as long as the query and branch agree, keep going
172                                 }
173                                 else{
174                                         numSuffixes++;                                                          //      if there is a mismatch, increase the number of 
175                                         presentNode = 0;                                                        //      suffixes and go back to the root
176                                         break;
177                                 }
178                         }
179                 }
180                 //      if we get all the way through the node we'll go to the top of the while loop and find the child node
181                 //      that corresponds to what we are interested in           
182         }
183         numSuffixes--;                                                                                          //      the method puts an extra count on numSuffixes
184         
185         return numSuffixes;                                                                                     //      change the value and return the number of suffixes
186 }
187 //********************************************************************************************************************
188
189 void SuffixTree::canonize(){    //      if you have to ask how this works, you don't really want to know and this really
190                                                                 //      isn't the place to ask.
191         if ( isExplicit() == 0 ) {      //      if the node has no children...
192                 
193                 int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
194                 SuffixNode* tempNode = nodeVector[tempNodeIndex];
195                 
196                 int span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
197                 
198                 while ( span <= ( activeEndPosition - activeStartPosition ) ) {
199                         
200             activeStartPosition = activeStartPosition + span + 1;
201                         
202                         activeNode = tempNodeIndex;
203                         
204             if ( activeStartPosition <= activeEndPosition ) {
205                                 tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]);
206                                 tempNode = nodeVector[tempNodeIndex];
207                                 span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
208             }
209                         
210         }
211     }
212 }
213
214 //********************************************************************************************************************
215
216 int SuffixTree::split(int nodeIndex, int position){     //      leaves stay leaves, etc, to split a leaf we make a new interior 
217                                                                                                         //      node and reconnect everything
218         SuffixNode* node = nodeVector[nodeIndex];                                       //      get the node that needs to be split
219         SuffixNode* parentNode = nodeVector[node->getParentNode()];     //      get it's parent node
220         
221         parentNode->eraseChild(sequence[node->getStartCharPos()]);      //      erase the present node from the registry of its parent
222         
223         nodeCounter++;
224         SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition);      //      create a new node that will link the parent with the old child
225         parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);//   give the parent the new child
226         nodeVector.push_back(newNode);
227         
228         node->setParentNode(nodeCounter);       //      give the original node the new node as its parent
229         newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex);
230         //      put the original node in the registry of the new node's children
231         newNode->setSuffixNode(activeNode);//link the new node with the old active node
232         
233         //      recalculate the startCharPosition of the outermost node
234         node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 );
235         
236         return node->getParentNode();
237 }
238
239 //********************************************************************************************************************
240
241 void SuffixTree::makeSuffixLink(int& previous, int present){
242         
243 //      here we link the nodes that are suffixes of one another to rapidly speed through the tree
244         if ( previous > 0 ) {   nodeVector[previous]->setSuffixNode(present);   }
245         else                            {       /*      do nothing                                                              */      }
246         
247     previous = present;
248 }
249
250 //********************************************************************************************************************
251
252 void SuffixTree::addPrefix(int prefixPosition){
253         
254         int lastParentNode = -1;        //      we need to place a new prefix in the suffix tree
255         int parentNode = 0;
256         
257         while(1){
258                 
259                 parentNode = activeNode;
260                 
261                 if(isExplicit() == 1){  //      if the node is explicit (has kids), try to follow it down the branch if its there...
262                         if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){   //      break out and get next prefix...
263                                 break;                                                                                          
264                         }
265                         else{                           //      ...otherwise continue, we'll need to make a new node later on...
266                         }
267                 }
268                 else{                                   //      if it's not explicit (no kids), read through and see if all of the chars agree...
269                         int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
270                         int span = activeEndPosition - activeStartPosition;
271                         
272                         if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){
273                                 break;                  //      if the existing suffix agrees with the new one, grab a new prefix...
274                         }
275                         else{
276                                 parentNode = split(tempNode, prefixPosition);   //      ... otherwise we need to split the node
277                         }
278                         
279                 }
280                 
281                 nodeCounter++;  //      we need to generate a new node here if the kid didn't exist, or we split a node
282                 SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1);
283                 nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter);
284                 nodeVector.push_back(newSuffixLeaf);
285                 
286                 makeSuffixLink( lastParentNode, parentNode );           //      make a suffix link for the parent node
287                 
288                 if(nodeVector[activeNode]->getParentNode() == -1){      //      move along the start position for the tree
289             activeStartPosition++;
290         } 
291                 else {
292             activeNode = nodeVector[activeNode]->getSuffixNode();
293                 }
294                 canonize();                                                                                     //      frankly, i'm not entirely clear on what canonize does.
295         }
296         
297         makeSuffixLink( lastParentNode, parentNode );
298         activeEndPosition++;                                                                    //      move along the end position for the tree
299         
300         canonize();                                                                                             //      frankly, i'm not entirely clear on what canonize does.
301         
302 }
303
304 //********************************************************************************************************************
305