]> git.donarmstrong.com Git - mothur.git/blob - knn.cpp
started work on classify.seqs command. changed the database class so that it does...
[mothur.git] / knn.cpp
1 /*
2  *  knn.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/4/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "knn.h"
11
12 /**************************************************************************************************/
13 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch, int n) 
14 : Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n)  {}
15 /**************************************************************************************************/
16 string Knn::getTaxonomy(Sequence* seq) {
17         try {
18                 string tax;
19                 
20                 //use database to find closest seq
21                 vector<int> closest = database->findClosestSequences(seq, num);
22                 
23                 vector<string> closestNames;
24                 for (int i = 0; i < closest.size(); i++) {
25                         //find that sequences taxonomy in map
26                         it = taxonomy.find(names[closest[i]]);
27                 
28                         //is this sequence in the taxonomy file
29                         if (it == taxonomy.end()) { //error not in file
30                                 mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file.  It will be eliminated as a match to sequence " + seq->getName() + "."); mothurOutEndLine();
31                         }else{   closestNames.push_back(it->first);     }
32                 }
33                 
34                 if (closestNames.size() == 0) {
35                         mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); mothurOutEndLine();
36                         tax = "bad seq";
37                 }else{
38                         tax = findCommonTaxonomy(closestNames);
39                         if (tax == "") { mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine(); tax = "bad seq"; }
40                 }
41                 
42                 return tax;     
43         }
44         catch(exception& e) {
45                 errorOut(e, "Knn", "getTaxonomy");
46                 exit(1);
47         }
48 }
49 /**************************************************************************************************/
50 string Knn::findCommonTaxonomy(vector<string> closest)  {
51         try {
52                 vector< vector<string> > taxons;  //taxon[0] = vector of taxonomy info for closest[0].
53                                                                                 //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
54                                                                                 //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
55                                                                                 
56                 taxons.resize(closest.size());
57                 int smallest = 100;
58                 
59                 for (int i = 0; i < closest.size(); i++) {
60                 
61                         string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
62                 
63                         tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
64         
65                         //parse taxonomy
66                         string individual;
67                         while (tax.find_first_of(';') != -1) {
68                                 individual = tax.substr(0,tax.find_first_of(';'));
69                                 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
70                                 taxons[i].push_back(individual);
71                         
72                         }
73                         //get last one
74                         taxons[i].push_back(tax);
75                 
76                         //figure out who has the shortest taxonomy info. so you can start comparing there
77                         if (taxons[i].size() < smallest) {
78                                 smallest = taxons[i].size();
79                         }
80                 }
81         
82                 //start at the highest level all the closest seqs have
83                 string common = "";
84                 for (int i = (smallest-1); i >= 0; i--) {
85
86                         string thistax = taxons[0][i];
87                         int num = 0;
88                         for (int j = 1; j < taxons.size(); j++) {
89                                 if (taxons[j][i] != thistax) { break; }
90                                 num = j;
91                         }
92                 
93                         if (num == (taxons.size()-1)) { //they all match at this level
94                                 for (int k = 0; k <= i; k++) {
95                                         common += taxons[0][k] + ';';
96                                 }
97                                 break;
98                         }
99                 }
100         
101                 return common;
102         }
103         catch(exception& e) {
104                 errorOut(e, "Knn", "findCommonTaxonomy");
105                 exit(1);
106         }
107 }
108 /**************************************************************************************************/
109