5 * Created by Pat Schloss on 12/15/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
8 * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that
9 * I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
11 * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
12 * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology.
13 * USA: Cambridge University Press
15 * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
17 * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at
18 * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence.
19 * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
20 * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the
21 * article and Perl source code provided at http://www.ddj.com/web-development/184416093.
25 #include "sequence.hpp"
26 #include "suffixnodes.hpp"
27 #include "suffixtree.hpp"
30 //********************************************************************************************************************
32 inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is necessary to print the tree and to sort the
33 return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent
36 //********************************************************************************************************************
38 SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode),
39 nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) {
41 m = MothurOut::getInstance();
43 for (int i = 0; i < st.nodeVector.size(); i++) {
44 SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i]));
45 nodeVector.push_back(temp);
49 }catch(exception& e) {
50 m->errorOut(e, "SuffixTree", "SuffixTree");
55 //********************************************************************************************************************
57 SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); }
59 //********************************************************************************************************************
61 SuffixTree::~SuffixTree(){
62 for(int i=0;i<nodeVector.size();i++){ delete nodeVector[i]; }
66 //********************************************************************************************************************
68 void SuffixTree::loadSequence(Sequence seq){
69 nodeCounter = 0; // initially there are 0 nodes in the tree
70 activeStartPosition = 0;
71 activeEndPosition = -1;
72 seqName = seq.getName();
73 sequence = seq.convert2ints();
74 sequence += '5'; // this essentially concatenates a '$' to the end of the sequence to
75 int seqLength = sequence.length(); // make it a cononical suffix tree
77 nodeVector.push_back(new SuffixBranch(-1, 0, -1)); // enter the root of the suffix tree
79 activeNode = root = 0;
81 for(int i=0;i<seqLength;i++){
82 addPrefix(i); // step through the sequence adding each prefix
86 //********************************************************************************************************************
88 string SuffixTree::getSeqName() {
92 //********************************************************************************************************************
94 void SuffixTree::print(){
95 vector<SuffixNode*> hold = nodeVector;
96 sort(hold.begin(), hold.end(), compareParents);
97 m->mothurOut("Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix"); m->mothurOutEndLine();
98 for(int i=1;i<=nodeCounter;i++){
99 hold[i]->print(sequence, i);
103 //********************************************************************************************************************
105 int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we count the number of suffix parts
106 // we need to rewrite a user supplied sequence. if the
107 int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
108 int seqLength = compareSequence.length(); // time complexity should be O(L)
113 while(position < seqLength){ // while the position in the query sequence isn't at the end...
115 if(numSuffixes > minValue) { return 1000000; } // bail if the count gets too high
117 int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
118 // child that matches the next character in the query
120 if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
121 numSuffixes++; // by one.
124 else{ // if there is, move to that node and see how far down
125 presentNode = newNode; // it we can get
127 for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
128 if(compareSequence[position] == sequence[i]){
129 position++; // as long as the query and branch agree, keep going
132 numSuffixes++; // if there is a mismatch, increase the number of
133 presentNode = 0; // suffixes and go back to the root
138 // if we get all the way through the node we'll go to the top of the while loop and find the child node
139 // that corresponds to what we are interested in
141 numSuffixes--; // the method puts an extra count on numSuffixes
143 if(numSuffixes < minValue) { minValue = numSuffixes; } // if the count is less than the previous minValue,
144 return numSuffixes; // change the value and return the number of suffixes
147 //********************************************************************************************************************
149 int SuffixTree::countSuffixes(string compareSequence){ // here we count the number of suffix parts
150 // we need to rewrite a user supplied sequence. if the
151 int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
152 int seqLength = compareSequence.length(); // time complexity should be O(L)
157 while(position < seqLength){ // while the position in the query sequence isn't at the end...
159 int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
160 // child that matches the next character in the query
162 if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
163 numSuffixes++; // by one.
166 else{ // if there is, move to that node and see how far down
167 presentNode = newNode; // it we can get
169 for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
170 if(compareSequence[position] == sequence[i]){
171 position++; // as long as the query and branch agree, keep going
174 numSuffixes++; // if there is a mismatch, increase the number of
175 presentNode = 0; // suffixes and go back to the root
180 // if we get all the way through the node we'll go to the top of the while loop and find the child node
181 // that corresponds to what we are interested in
183 numSuffixes--; // the method puts an extra count on numSuffixes
185 return numSuffixes; // change the value and return the number of suffixes
187 //********************************************************************************************************************
189 void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really
190 // isn't the place to ask.
191 if ( isExplicit() == 0 ) { // if the node has no children...
193 int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
194 SuffixNode* tempNode = nodeVector[tempNodeIndex];
196 int span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
198 while ( span <= ( activeEndPosition - activeStartPosition ) ) {
200 activeStartPosition = activeStartPosition + span + 1;
202 activeNode = tempNodeIndex;
204 if ( activeStartPosition <= activeEndPosition ) {
205 tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]);
206 tempNode = nodeVector[tempNodeIndex];
207 span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
214 //********************************************************************************************************************
216 int SuffixTree::split(int nodeIndex, int position){ // leaves stay leaves, etc, to split a leaf we make a new interior
217 // node and reconnect everything
218 SuffixNode* node = nodeVector[nodeIndex]; // get the node that needs to be split
219 SuffixNode* parentNode = nodeVector[node->getParentNode()]; // get it's parent node
221 parentNode->eraseChild(sequence[node->getStartCharPos()]); // erase the present node from the registry of its parent
224 SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition); // create a new node that will link the parent with the old child
225 parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);// give the parent the new child
226 nodeVector.push_back(newNode);
228 node->setParentNode(nodeCounter); // give the original node the new node as its parent
229 newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex);
230 // put the original node in the registry of the new node's children
231 newNode->setSuffixNode(activeNode);//link the new node with the old active node
233 // recalculate the startCharPosition of the outermost node
234 node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 );
236 return node->getParentNode();
239 //********************************************************************************************************************
241 void SuffixTree::makeSuffixLink(int& previous, int present){
243 // here we link the nodes that are suffixes of one another to rapidly speed through the tree
244 if ( previous > 0 ) { nodeVector[previous]->setSuffixNode(present); }
245 else { /* do nothing */ }
250 //********************************************************************************************************************
252 void SuffixTree::addPrefix(int prefixPosition){
254 int lastParentNode = -1; // we need to place a new prefix in the suffix tree
259 parentNode = activeNode;
261 if(isExplicit() == 1){ // if the node is explicit (has kids), try to follow it down the branch if its there...
262 if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){ // break out and get next prefix...
265 else{ // ...otherwise continue, we'll need to make a new node later on...
268 else{ // if it's not explicit (no kids), read through and see if all of the chars agree...
269 int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
270 int span = activeEndPosition - activeStartPosition;
272 if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){
273 break; // if the existing suffix agrees with the new one, grab a new prefix...
276 parentNode = split(tempNode, prefixPosition); // ... otherwise we need to split the node
281 nodeCounter++; // we need to generate a new node here if the kid didn't exist, or we split a node
282 SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1);
283 nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter);
284 nodeVector.push_back(newSuffixLeaf);
286 makeSuffixLink( lastParentNode, parentNode ); // make a suffix link for the parent node
288 if(nodeVector[activeNode]->getParentNode() == -1){ // move along the start position for the tree
289 activeStartPosition++;
292 activeNode = nodeVector[activeNode]->getSuffixNode();
294 canonize(); // frankly, i'm not entirely clear on what canonize does.
297 makeSuffixLink( lastParentNode, parentNode );
298 activeEndPosition++; // move along the end position for the tree
300 canonize(); // frankly, i'm not entirely clear on what canonize does.
304 //********************************************************************************************************************