]> git.donarmstrong.com Git - mothur.git/blob - knn.cpp
changes while testing
[mothur.git] / knn.cpp
1 /*
2  *  knn.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/4/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "knn.h"
11
12 /**************************************************************************************************/
13 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n, int tid) 
14 : Classify(), num(n), search(method) {
15         try {
16                 threadID = tid;
17         shortcuts = true;
18                 
19                 //create search database and names vector
20                 generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
21         }
22         catch(exception& e) {
23                 m->errorOut(e, "Knn", "Knn");
24                 exit(1);
25         }
26 }
27 /**************************************************************************************************/
28 void Knn::setDistName(string s) {
29         try {
30                 outDistName = s;
31                 ofstream outDistance;
32                 m->openOutputFile(outDistName, outDistance);
33                 outDistance << "Name\tBestMatch\tDistance" << endl;
34                 outDistance.close();
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "Knn", "setDistName");
38                 exit(1);
39         }
40 }
41 /**************************************************************************************************/
42 Knn::~Knn() {
43         try {
44                  delete phyloTree; 
45                  if (database != NULL) {  delete database; }
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "Knn", "~Knn");
49                 exit(1);
50         }
51 }
52 /**************************************************************************************************/
53 string Knn::getTaxonomy(Sequence* seq) {
54         try {
55                 string tax;
56                 
57                 //use database to find closest seq
58                 vector<int> closest = database->findClosestSequences(seq, num);
59         
60                 if (search == "distance") { ofstream outDistance; m->openOutputFileAppend(outDistName, outDistance); outDistance << seq->getName() << '\t' << database->getName(closest[0]) << '\t' << database->getSearchScore() << endl; outDistance.close();  }
61         
62                 if (m->control_pressed) { return tax; }
63
64                 vector<string> closestNames;
65                 for (int i = 0; i < closest.size(); i++) {
66                         //find that sequences taxonomy in map
67                         it = taxonomy.find(names[closest[i]]);
68                 
69                         //is this sequence in the taxonomy file
70                         if (it == taxonomy.end()) { //error not in file
71                                 m->mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file.  It will be eliminated as a match to sequence " + seq->getName() + "."); m->mothurOutEndLine();
72                         }else{   closestNames.push_back(it->first);     }
73                 }
74                 
75                 if (closestNames.size() == 0) {
76                         m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine();
77                         tax = "unknown;";
78                 }else{
79                         tax = findCommonTaxonomy(closestNames);
80                         if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; }
81                 }
82                 
83                 simpleTax = tax;
84                 return tax;     
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "Knn", "getTaxonomy");
88                 exit(1);
89         }
90 }
91 /**************************************************************************************************/
92 string Knn::findCommonTaxonomy(vector<string> closest)  {
93         try {
94                 /*vector< vector<string> > taxons;  //taxon[0] = vector of taxonomy info for closest[0].
95                                                                                 //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
96                                                                                 //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
97                                                                                 
98                 taxons.resize(closest.size());
99                 int smallest = 100;
100                 
101                 for (int i = 0; i < closest.size(); i++) {
102                         if (m->control_pressed) { return "control"; }
103                 
104                         string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
105                         cout << tax << endl;
106                 
107                         taxons[i] = parseTax(tax);
108                 
109                         //figure out who has the shortest taxonomy info. so you can start comparing there
110                         if (taxons[i].size() < smallest) {
111                                 smallest = taxons[i].size();
112                         }
113                 }
114         
115                 //start at the highest level all the closest seqs have
116                 string common = "";
117                 for (int i = (smallest-1); i >= 0; i--) {
118                         if (m->control_pressed) { return "control"; }
119
120                         string thistax = taxons[0][i];
121                         int num = 0;
122                         for (int j = 1; j < taxons.size(); j++) {
123                                 if (taxons[j][i] != thistax) { break; }
124                                 num = j;
125                         }
126                 
127                         if (num == (taxons.size()-1)) { //they all match at this level
128                                 for (int k = 0; k <= i; k++) {
129                                         common += taxons[0][k] + ';';
130                                 }
131                                 break;
132                         }
133                 }*/
134                 
135                 string conTax;
136                 
137                 //create a tree containing sequences from this bin
138                 PhyloTree* p = new PhyloTree();
139                 
140                 for (int i = 0; i < closest.size(); i++) {
141                         p->addSeqToTree(closest[i], taxonomy[closest[i]]);
142                 }
143                 
144                 //build tree
145                 p->assignHeirarchyIDs(0);
146                 
147                 TaxNode currentNode = p->get(0);
148                 
149                 //at each level
150                 while (currentNode.children.size() != 0) { //you still have more to explore
151                         
152                         TaxNode bestChild;
153                         int bestChildSize = 0;
154                         
155                         //go through children
156                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
157                                 
158                                 TaxNode temp = p->get(itChild->second);
159                                 
160                                 //select child with largest accessions - most seqs assigned to it
161                                 if (temp.accessions.size() > bestChildSize) {
162                                         bestChild = p->get(itChild->second);
163                                         bestChildSize = temp.accessions.size();
164                                 }
165                                 
166                         }
167                         
168                         if (bestChildSize == closest.size()) { //if yes, add it
169                                 conTax += bestChild.name + ";";
170                         }else{ //if no, quit
171                                 break;
172                         }
173                         
174                         //move down a level
175                         currentNode = bestChild;
176                 }
177                 
178                 delete p;       
179                 
180                 return conTax;
181         }
182         catch(exception& e) {
183                 m->errorOut(e, "Knn", "findCommonTaxonomy");
184                 exit(1);
185         }
186 }
187 /**************************************************************************************************/
188