5 * Created by westcott on 11/4/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
12 /**************************************************************************************************/
// Constructor. Runs the default Classify() base constructor, stores n in `num`
// (later handed to database->findClosestSequences in getTaxonomy) and stores
// `method` in `search` (compared against "distance" in getTaxonomy).
// NOTE(review): this listing is fragmentary; the try/catch scaffolding and the
// closing braces are not visible here. `tid` is not referenced on any visible line.
13 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n, int tid)
14 : Classify(), num(n), search(method) {
18 //create search database and names vector
// Every argument except n and tid is forwarded unchanged.
19 generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
// Error path: reports `e` tagged with the class and method name.
// Presumably this sits inside a catch(exception& e) handler that is not shown - confirm.
22 m->errorOut(e, "Knn", "Knn");
26 /**************************************************************************************************/
// Opens the distance report named by outDistName and writes its header row.
// NOTE(review): the visible lines never read `s`; presumably it is assigned to
// outDistName on a line missing from this listing - confirm. The declaration of
// the outDistance stream is likewise not visible.
27 void Knn::setDistName(string s) {
// Opens via openOutputFile here, whereas getTaxonomy reopens the same name with
// openOutputFileAppend to add one row per query.
31 m->openOutputFile(outDistName, outDistance);
// Header columns, tab separated; they match the three fields getTaxonomy writes.
32 outDistance << "Name\tBestMatch\tDistance" << endl;
// Error path: reports `e` tagged with the class and method name (handler not shown).
36 m->errorOut(e, "Knn", "setDistName");
40 /**************************************************************************************************/
// Fragment of the destructor body: the signature line is missing from this
// listing, and the "~Knn" tag on the error report below is what identifies it.
// Releases the search database if one is held.
44 if (database != NULL) { delete database; }
// Error path: reports `e` tagged with the class and method name (handler not shown).
47 m->errorOut(e, "Knn", "~Knn");
51 /**************************************************************************************************/
// Classifies one query sequence: asks the search database for the `num` closest
// reference sequences, keeps those that have an entry in `taxonomy`, and reduces
// their taxonomies to a single string through findCommonTaxonomy.
// Falls back to "unknown;" when findCommonTaxonomy returns an empty string.
// NOTE(review): this listing is fragmentary; the declarations of `tax` and `it`,
// the if/else bracing and the final return are not visible here.
52 string Knn::getTaxonomy(Sequence* seq) {
56 //use database to find closest seq
57 vector<int> closest = database->findClosestSequences(seq, num);
// In "distance" mode, append one row (query name, name of the first hit, search
// score) to the distance report; the file is opened and closed on every call.
// NOTE(review): closest[0] is read with no visible emptiness check - confirm
// findClosestSequences cannot return an empty vector in this mode.
59 if (search == "distance") { ofstream outDistance; m->openOutputFileAppend(outDistName, outDistance); outDistance << seq->getName() << '\t' << database->getName(closest[0]) << '\t' << database->getSearchScore() << endl; outDistance.close(); }
// Bail out early when the control_pressed flag is set; `tax` is returned as it
// stands at this point (no visible line has assigned to it yet).
61 if (m->control_pressed) { return tax; }
// Collect the names of the hits that do have a taxonomy entry.
63 vector<string> closestNames;
64 for (int i = 0; i < closest.size(); i++) {
65 //find that sequences taxonomy in map
66 it = taxonomy.find(names[closest[i]]);
68 //is this sequence in the taxonomy file
69 if (it == taxonomy.end()) { //error not in file
70 m->mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file. It will be eliminated as a match to sequence " + seq->getName() + "."); m->mothurOutEndLine();
71 }else{ closestNames.push_back(it->first); }
// No usable hit survived the lookup: report it.
// Presumably `tax` is set to a fallback on a line not shown - confirm.
74 if (closestNames.size() == 0) {
75 m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine();
// Otherwise derive the shared taxonomy of the surviving hits; an empty result
// is reported and replaced by "unknown;".
78 tax = findCommonTaxonomy(closestNames);
79 if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; }
// Error path: reports `e` tagged with the class and method name (handler not shown).
86 m->errorOut(e, "Knn", "getTaxonomy");
90 /**************************************************************************************************/
// Builds a consensus taxonomy string for the given reference names (the caller,
// getTaxonomy, has already checked that each has an entry in `taxonomy`).
// The lines that follow the signature open a block comment holding an older
// implementation that compared parsed taxonomy levels directly.
// NOTE(review): the end of that block comment is not visible in this listing;
// presumably it closes just before the PhyloTree section below - confirm.
// The live code inserts every name into a PhyloTree, then walks down from node 0,
// appending a child's name to `conTax` when that child holds all of the names.
// NOTE(review): the declarations of `conTax` and `bestChild`, the branch taken
// when the best child does not hold every name, the return statement and any
// `delete p` are not visible here - confirm the tree allocated with new is freed.
91 string Knn::findCommonTaxonomy(vector<string> closest) {
93 /*vector< vector<string> > taxons; //taxon[0] = vector of taxonomy info for closest[0].
94 //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
95 //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
97 taxons.resize(closest.size());
100 for (int i = 0; i < closest.size(); i++) {
101 if (m->control_pressed) { return "control"; }
103 string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy
106 taxons[i] = parseTax(tax);
108 //figure out who has the shortest taxonomy info. so you can start comparing there
109 if (taxons[i].size() < smallest) {
110 smallest = taxons[i].size();
114 //start at the highest level all the closest seqs have
116 for (int i = (smallest-1); i >= 0; i--) {
117 if (m->control_pressed) { return "control"; }
119 string thistax = taxons[0][i];
121 for (int j = 1; j < taxons.size(); j++) {
122 if (taxons[j][i] != thistax) { break; }
126 if (num == (taxons.size()-1)) { //they all match at this level
127 for (int k = 0; k <= i; k++) {
128 common += taxons[0][k] + ';';
136 //create a tree containing sequences from this bin
137 PhyloTree* p = new PhyloTree();
// Each name is added together with its full taxonomy string.
139 for (int i = 0; i < closest.size(); i++) {
140 p->addSeqToTree(closest[i], taxonomy[closest[i]]);
144 p->assignHeirarchyIDs(0);
// Start the descent at node 0.
146 TaxNode currentNode = p->get(0);
149 while (currentNode.children.size() != 0) { //you still have more to explore
152 int bestChildSize = 0;
154 //go through children
155 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
// The map value is the index used to fetch the child node from the tree.
157 TaxNode temp = p->get(itChild->second);
159 //select child with largest accessions - most seqs assigned to it
// Strict '>' means that on a tie the child met first in map order is kept.
// NOTE(review): this compares an unsigned size() with the int bestChildSize.
160 if (temp.accessions.size() > bestChildSize) {
161 bestChild = p->get(itChild->second);
162 bestChildSize = temp.accessions.size();
// Only a child that holds every input name contributes a level to the result.
167 if (bestChildSize == closest.size()) { //if yes, add it
168 conTax += bestChild.name + ";";
// Descend into the chosen child for the next iteration.
174 currentNode = bestChild;
181 catch(exception& e) {
182 m->errorOut(e, "Knn", "findCommonTaxonomy");
186 /**************************************************************************************************/