]> git.donarmstrong.com Git - mothur.git/blob - source/knn.cpp
fixing
[mothur.git] / source / knn.cpp
1 /*
2  *  knn.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/4/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "knn.h"
11
12 /**************************************************************************************************/
13 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n, int tid) 
14 : Classify(), num(n), search(method) {
15         try {
16                 threadID = tid;
17                 
18                 //create search database and names vector
19                 generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
20         }
21         catch(exception& e) {
22                 m->errorOut(e, "Knn", "Knn");
23                 exit(1);
24         }
25 }
26 /**************************************************************************************************/
27 void Knn::setDistName(string s) {
28         try {
29                 outDistName = s;
30                 ofstream outDistance;
31                 m->openOutputFile(outDistName, outDistance);
32                 outDistance << "Name\tBestMatch\tDistance" << endl;
33                 outDistance.close();
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "Knn", "setDistName");
37                 exit(1);
38         }
39 }
40 /**************************************************************************************************/
41 Knn::~Knn() {
42         try {
43                  delete phyloTree; 
44                  if (database != NULL) {  delete database; }
45         }
46         catch(exception& e) {
47                 m->errorOut(e, "Knn", "~Knn");
48                 exit(1);
49         }
50 }
51 /**************************************************************************************************/
52 string Knn::getTaxonomy(Sequence* seq) {
53         try {
54                 string tax;
55                 
56                 //use database to find closest seq
57                 vector<int> closest = database->findClosestSequences(seq, num);
58         
59                 if (search == "distance") { ofstream outDistance; m->openOutputFileAppend(outDistName, outDistance); outDistance << seq->getName() << '\t' << database->getName(closest[0]) << '\t' << database->getSearchScore() << endl; outDistance.close();  }
60         
61                 if (m->control_pressed) { return tax; }
62
63                 vector<string> closestNames;
64                 for (int i = 0; i < closest.size(); i++) {
65                         //find that sequences taxonomy in map
66                         it = taxonomy.find(names[closest[i]]);
67                 
68                         //is this sequence in the taxonomy file
69                         if (it == taxonomy.end()) { //error not in file
70                                 m->mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file.  It will be eliminated as a match to sequence " + seq->getName() + "."); m->mothurOutEndLine();
71                         }else{   closestNames.push_back(it->first);     }
72                 }
73                 
74                 if (closestNames.size() == 0) {
75                         m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine();
76                         tax = "unknown;";
77                 }else{
78                         tax = findCommonTaxonomy(closestNames);
79                         if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; }
80                 }
81                 
82                 simpleTax = tax;
83                 return tax;     
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "Knn", "getTaxonomy");
87                 exit(1);
88         }
89 }
90 /**************************************************************************************************/
91 string Knn::findCommonTaxonomy(vector<string> closest)  {
92         try {
93                 /*vector< vector<string> > taxons;  //taxon[0] = vector of taxonomy info for closest[0].
94                                                                                 //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
95                                                                                 //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
96                                                                                 
97                 taxons.resize(closest.size());
98                 int smallest = 100;
99                 
100                 for (int i = 0; i < closest.size(); i++) {
101                         if (m->control_pressed) { return "control"; }
102                 
103                         string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
104                         cout << tax << endl;
105                 
106                         taxons[i] = parseTax(tax);
107                 
108                         //figure out who has the shortest taxonomy info. so you can start comparing there
109                         if (taxons[i].size() < smallest) {
110                                 smallest = taxons[i].size();
111                         }
112                 }
113         
114                 //start at the highest level all the closest seqs have
115                 string common = "";
116                 for (int i = (smallest-1); i >= 0; i--) {
117                         if (m->control_pressed) { return "control"; }
118
119                         string thistax = taxons[0][i];
120                         int num = 0;
121                         for (int j = 1; j < taxons.size(); j++) {
122                                 if (taxons[j][i] != thistax) { break; }
123                                 num = j;
124                         }
125                 
126                         if (num == (taxons.size()-1)) { //they all match at this level
127                                 for (int k = 0; k <= i; k++) {
128                                         common += taxons[0][k] + ';';
129                                 }
130                                 break;
131                         }
132                 }*/
133                 
134                 string conTax;
135                 
136                 //create a tree containing sequences from this bin
137                 PhyloTree* p = new PhyloTree();
138                 
139                 for (int i = 0; i < closest.size(); i++) {
140                         p->addSeqToTree(closest[i], taxonomy[closest[i]]);
141                 }
142                 
143                 //build tree
144                 p->assignHeirarchyIDs(0);
145                 
146                 TaxNode currentNode = p->get(0);
147                 
148                 //at each level
149                 while (currentNode.children.size() != 0) { //you still have more to explore
150                         
151                         TaxNode bestChild;
152                         int bestChildSize = 0;
153                         
154                         //go through children
155                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
156                                 
157                                 TaxNode temp = p->get(itChild->second);
158                                 
159                                 //select child with largest accessions - most seqs assigned to it
160                                 if (temp.accessions.size() > bestChildSize) {
161                                         bestChild = p->get(itChild->second);
162                                         bestChildSize = temp.accessions.size();
163                                 }
164                                 
165                         }
166                         
167                         if (bestChildSize == closest.size()) { //if yes, add it
168                                 conTax += bestChild.name + ";";
169                         }else{ //if no, quit
170                                 break;
171                         }
172                         
173                         //move down a level
174                         currentNode = bestChild;
175                 }
176                 
177                 delete p;       
178                 
179                 return conTax;
180         }
181         catch(exception& e) {
182                 m->errorOut(e, "Knn", "findCommonTaxonomy");
183                 exit(1);
184         }
185 }
186 /**************************************************************************************************/
187