]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
modified bin.seqs and get.oturep commands to include use of a groupfile if provided...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(){
14         try{
15                 globaldata = GlobalData::getInstance();
16         
17                 if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }
18                 
19                 //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
20                 if(globaldata->gListVector != NULL)             {       
21                         listOfNames = new ListVector(*globaldata->gListVector); 
22                         
23                         vector<string> names;
24                         string binnames;
25                         //map names to rows in sparsematrix
26                         for (int i = 0; i < listOfNames->size(); i++) {
27                                 names.clear();
28                                 binnames = listOfNames->get(i);
29                                 splitAtComma(binnames, names);
30                                 
31                                 for (int j = 0; j < names.size(); j++) {
32                                         nameToIndex[names[j]] = i;
33                                 }
34                         }
35                 }else { cout << "error, no listvector." << endl; }
36
37                 
38                 fastafile = globaldata->getFastaFile();
39                 namesfile = globaldata->getNameFile();
40                 groupfile = globaldata->getGroupFile();
41                 
42                 if (groupfile != "") {
43                         //read in group map info.
44                         groupMap = new GroupMap(groupfile);
45                         groupMap->readMap();
46                 }
47
48                 openInputFile(fastafile, in);
49                 
50                 fasta = new FastaMap();
51
52         }
53         catch(exception& e) {
54                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
55                 exit(1);
56         }
57         catch(...) {
58                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
59                 exit(1);
60         }
61 }
62 //**********************************************************************************************************************
63
64 GetOTURepCommand::~GetOTURepCommand(){
65         delete matrix;
66         delete list;
67         delete input;
68         delete read;
69         delete fasta;
70         if (groupfile != "") {
71                 delete groupMap;
72         }
73 }
74
75 //**********************************************************************************************************************
76
77 int GetOTURepCommand::execute(){
78         try {
79                 int count = 1;
80                 string nameRep, name, sequence;
81                 
82                 //read fastafile
83                 fasta->readFastaFile(in);
84                 
85                 //set format to list so input can get listvector
86                 globaldata->setFormat("list");
87                 
88                 //if user gave a namesfile then use it
89                 if (namesfile != "") {
90                         readNamesFile();
91                 }
92                 
93                 //read list file
94                 read = new ReadOTUFile(globaldata->getListFile());      
95                 read->read(&*globaldata); 
96                 
97                 input = globaldata->ginput;
98                 list = globaldata->gListVector;
99                 
100                 while(list != NULL){
101                         
102                         if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
103                                 
104                                 //create output file
105                                 string outputFileName = getRootName(globaldata->getListFile()) + list->getLabel() + ".rep.fasta";
106                                 openOutputFile(outputFileName, out);
107
108                                 cout << list->getLabel() << '\t' << count << endl;
109                                 
110                                 //for each bin in the list vector
111                                 for (int i = 0; i < list->size(); i++) {
112                                         string groups;
113                                         nameRep = FindRep(i, groups);
114                                         
115                                         //print out name and sequence for that bin
116                                         sequence = fasta->getSequence(nameRep);
117
118                                         if (sequence != "not found") {
119                                                 if (groupfile == "") {
120                                                         nameRep = nameRep + "|" + toString(i+1);
121                                                         out << ">" << nameRep << endl;
122                                                         out << sequence << endl;
123                                                 }else {
124                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
125                                                         out << ">" << nameRep << endl;
126                                                         out << sequence << endl;
127                                                 }
128                                         }else { 
129                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
130                                                 remove(outputFileName.c_str());
131                                                 return 0;
132                                         }
133                                 }
134                                 
135                                 out.close();
136                         }
137                         
138                         list = input->getListVector();
139                         count++;
140                 }
141
142                 
143                 return 0;
144         }
145         catch(exception& e) {
146                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
147                 exit(1);
148         }
149         catch(...) {
150                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
151                 exit(1);
152         }
153
154 }
155
156 //**********************************************************************************************************************
157 void GetOTURepCommand::readNamesFile() {
158         try {
159                 vector<string> dupNames;
160                 openInputFile(namesfile, inNames);
161                 
162                 string name, names, sequence;
163         
164                 while(inNames){
165                         inNames >> name;                        //read from first column  A
166                         inNames >> names;               //read from second column  A,B,C,D
167                         
168                         dupNames.clear();
169                         
170                         //parse names into vector
171                         splitAtComma(names, dupNames);
172                         
173                         //store names in fasta map
174                         sequence = fasta->getSequence(name);
175                         for (int i = 0; i < dupNames.size(); i++) {
176                                 fasta->push_back(dupNames[i], sequence);
177                         }
178                 
179                         gobble(inNames);
180                 }
181                 inNames.close();
182
183         }
184         catch(exception& e) {
185                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
186                 exit(1);
187         }
188         catch(...) {
189                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
190                 exit(1);
191         }       
192 }
193 //**********************************************************************************************************************
194 string GetOTURepCommand::FindRep(int bin, string& group) {
195         try{
196                 vector<string> names;
197                 map<string, float> sums;
198                 map<string, float>::iterator it4;
199                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
200                 string binnames;
201                 float min = 10000;
202                 string minName;
203                 map<string, string> groups;
204                 map<string, string>::iterator groupIt;
205                 
206                 binnames = list->get(bin);
207         
208                 //parse names into vector
209                 splitAtComma(binnames, names);
210                 
211                 //if you have a groupfile
212                 if(groupfile != "") {
213                         //find the groups that are in this bin
214                         for (int i = 0; i < names.size(); i++) {
215                                 string groupName = groupMap->getGroup(names[i]);
216                                 if (groupName == "not found") {  
217                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
218                                         groupError = true;
219                                 }else{
220                                         groups[groupName] = groupName;
221                                 }
222                         }
223                         
224                         //turn the groups into a string
225                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
226                         
227                         //rip off last dash
228                         group = group.substr(0, group.length()-1);
229                 }
230                 
231                 //if only 1 sequence in bin then that's the rep
232                 if (names.size() == 1) { return names[0]; }
233                 else {
234                         //fill binMap
235                         for (int i = 0; i < names.size(); i++) {
236                                 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
237
238                                         if (it3->first == names[i]) {  
239                                                 binMap[it3->second] = it3->first;
240
241                                                 //initialize sums map
242                                                 sums[it3->first] = 0.0;
243                                                 break;
244                                         }
245                                 }
246                         }
247                         
248                         //go through each cell in the sparsematrix
249                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
250                                 //is this a distance between 2 members of this bin
251                                 it = binMap.find(currentCell->row);
252                                 it2 = binMap.find(currentCell->column);
253                                 
254                                 //sum the distance of the sequences in the bin to eachother
255                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
256                                         //this is a cell that repesents the distance between to of this bins members
257                                         sums[it->second] += currentCell->dist;
258                                         sums[it2->second] += currentCell->dist;
259                                 }
260                         }
261                         
262                         //smallest sum is the representative
263                         for (it4 = sums.begin(); it4 != sums.end(); it4++) {
264                                 if (it4->second < min) {
265                                         min = it4->second;
266                                         minName = it4->first;
267                                 }
268
269                         }
270                         
271                         return minName;
272                 }
273         
274         }
275         catch(exception& e) {
276                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
277                 exit(1);
278         }
279         catch(...) {
280                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
281                 exit(1);
282         }       
283 }
284
285
286
287
288