]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
fixes while testing
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11
12 //**********************************************************************************************************************
13 inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
14 //**********************************************************************************************************************
15 vector<string> PreClusterCommand::getValidParameters(){ 
16         try {
17                 string Array[] =  {"fasta", "name", "diffs", "outputdir","inputdir"};
18                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
19                 return myArray;
20         }
21         catch(exception& e) {
22                 m->errorOut(e, "PreClusterCommand", "getValidParameters");
23                 exit(1);
24         }
25 }
26 //**********************************************************************************************************************
27 PreClusterCommand::PreClusterCommand(){ 
28         try {
29                 abort = true;
30                 //initialize outputTypes
31                 vector<string> tempOutNames;
32                 outputTypes["fasta"] = tempOutNames;
33                 outputTypes["name"] = tempOutNames;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 vector<string> PreClusterCommand::getRequiredParameters(){      
42         try {
43                 string Array[] =  {"fasta"};
44                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45                 return myArray;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "PreClusterCommand", "getRequiredParameters");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> PreClusterCommand::getRequiredFiles(){   
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PreClusterCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64
65 PreClusterCommand::PreClusterCommand(string option) {
66         try {
67                 abort = false;
68                 
69                 //allow user to run help
70                 if(option == "help") { help(); abort = true; }
71                 
72                 else {
73                         //valid paramters for this command
74                         string Array[] =  {"fasta", "name", "diffs", "outputdir","inputdir"};
75                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
76                         
77                         OptionParser parser(option);
78                         map<string, string> parameters = parser.getParameters();
79                         
80                         ValidParameters validParameter;
81                         map<string, string>::iterator it;
82                 
83                         //check to make sure all parameters are valid for command
84                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
85                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
86                         }
87                         
88                         //initialize outputTypes
89                         vector<string> tempOutNames;
90                         outputTypes["fasta"] = tempOutNames;
91                         outputTypes["name"] = tempOutNames;
92                 
93                         //if the user changes the input directory command factory will send this info to us in the output parameter 
94                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
95                         if (inputDir == "not found"){   inputDir = "";          }
96                         else {
97                                 string path;
98                                 it = parameters.find("fasta");
99                                 //user has given a template file
100                                 if(it != parameters.end()){ 
101                                         path = m->hasPath(it->second);
102                                         //if the user has not given a path then, add inputdir. else leave path alone.
103                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
104                                 }
105                                 
106                                 it = parameters.find("name");
107                                 //user has given a template file
108                                 if(it != parameters.end()){ 
109                                         path = m->hasPath(it->second);
110                                         //if the user has not given a path then, add inputdir. else leave path alone.
111                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
112                                 }
113                         }
114
115                         //check for required parameters
116                         fastafile = validParameter.validFile(parameters, "fasta", true);
117                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the pre.cluster command."); m->mothurOutEndLine(); abort = true; }
118                         else if (fastafile == "not open") { abort = true; }     
119                         
120                         //if the user changes the output directory command factory will send this info to us in the output parameter 
121                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
122                                 outputDir = ""; 
123                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
124                         }
125
126                         //check for optional parameter and set defaults
127                         // ...at some point should added some additional type checking...
128                         namefile = validParameter.validFile(parameters, "name", true);
129                         if (namefile == "not found") { namefile =  "";  }
130                         else if (namefile == "not open") { abort = true; }      
131                         else {  readNameFile();  }
132                         
133                         string temp     = validParameter.validFile(parameters, "diffs", false);                         if(temp == "not found"){        temp = "1"; }
134                         convert(temp, diffs); 
135                 }
136                                 
137         }
138         catch(exception& e) {
139                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
140                 exit(1);
141         }
142 }
143
144 //**********************************************************************************************************************
145 PreClusterCommand::~PreClusterCommand(){}       
146 //**********************************************************************************************************************
147
148 void PreClusterCommand::help(){
149         try {
150                 m->mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n");
151                 m->mothurOut("The pre.cluster command outputs a new fasta and name file.\n");
152                 m->mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n");
153                 m->mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n");
154                 m->mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n");
155                 m->mothurOut("The pre.cluster command should be in the following format: \n");
156                 m->mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n");
157                 m->mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n");
158                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "PreClusterCommand", "help");
162                 exit(1);
163         }
164 }
165 //**********************************************************************************************************************
166
167 int PreClusterCommand::execute(){
168         try {
169                 
170                 if (abort == true) { return 0; }
171                 
172                 int start = time(NULL);
173                 
174                 //reads fasta file and return number of seqs
175                 int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
176                 
177                 if (m->control_pressed) { return 0; }
178         
179                 if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
180                 if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
181                 
182                 //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
183 //              sizes.clear();
184         
185                 //sort seqs by number of identical seqs
186                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
187         
188                 int count = 0;
189
190                 //think about running through twice...
191                 for (int i = 0; i < numSeqs; i++) {
192                         
193                         //are you active
194                         //                      itActive = active.find(alignSeqs[i].seq.getName());
195                         
196                         if (alignSeqs[i].active) {  //this sequence has not been merged yet
197                                 
198                                 //try to merge it with all smaller seqs
199                                 for (int j = i+1; j < numSeqs; j++) {
200                                         
201                                         if (m->control_pressed) { return 0; }
202                                         
203                                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
204                                                 //are you within "diff" bases
205                                                 int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
206                                                 
207                                                 if (mismatch <= diffs) {
208                                                         //merge
209                                                         alignSeqs[i].names += ',' + alignSeqs[j].names;
210                                                         alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
211
212                                                         alignSeqs[j].active = 0;
213                                                         alignSeqs[j].numIdentical = 0;
214                                                         count++;
215                                                 }
216                                         }//end if j active
217                                 }//end if i != j
218                         
219                         //remove from active list 
220                                 alignSeqs[i].active = 0;
221                                 
222                         }//end if active i
223                         if(i % 100 == 0)        { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
224                 }
225                 
226                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }
227         
228                 
229                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
230                 
231                 string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile);
232                 string newNamesFile = fileroot + "precluster.names";
233                 
234                 if (m->control_pressed) { return 0; }
235                 
236                 m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
237                 m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
238                 printData(newFastaFile, newNamesFile);
239                 
240                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
241                 
242                 if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; }
243                 
244                 m->mothurOutEndLine();
245                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
246                 m->mothurOut(newFastaFile); m->mothurOutEndLine();      outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
247                 m->mothurOut(newNamesFile); m->mothurOutEndLine();      outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
248                 m->mothurOutEndLine();
249
250                 return 0;
251                 
252         }
253         catch(exception& e) {
254                 m->errorOut(e, "PreClusterCommand", "execute");
255                 exit(1);
256         }
257 }
258
259 /**************************************************************************************************/
260
261 //this requires the names and fasta file to be in the same order
262
263 int PreClusterCommand::readFASTA(){
264         try {
265                 //ifstream inNames;
266                 ifstream inFasta;
267                 
268                 //m->openInputFile(namefile, inNames);
269                 m->openInputFile(fastafile, inFasta);
270                 
271                 //string firstCol, secondCol, nameString;
272                 length = 0;
273                 
274                 while (!inFasta.eof()) {
275                         
276                         if (m->control_pressed) { inFasta.close(); return 0; }
277                         
278                         //inNames >> firstCol >> secondCol;
279                         //nameString = secondCol;
280                         
281                         //m->gobble(inNames);
282                         //int size = 1;
283                         //while (secondCol.find_first_of(',') != -1) { 
284                         //      size++;
285                         //      secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
286                         //}
287                         
288                         Sequence seq(inFasta);  m->gobble(inFasta);
289                         
290                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
291                                 if (namefile != "") {
292                                         itSize = sizes.find(seq.getName());
293                                         
294                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
295                                         else{
296                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
297                                                 alignSeqs.push_back(tempNode);
298                                                 if (seq.getAligned().length() > length) {  length = seq.getAligned().length();  }
299                                         }       
300                                 }else { //no names file, you are identical to yourself 
301                                         seqPNode tempNode(1, seq, seq.getName());
302                                         alignSeqs.push_back(tempNode);
303                                         if (seq.getAligned().length() > length) {  length = seq.getAligned().length();  }
304                                 }
305                         }
306                 }
307                 inFasta.close();
308                 //inNames.close();
309                 return alignSeqs.size();
310         }
311         
312         catch(exception& e) {
313                 m->errorOut(e, "PreClusterCommand", "readFASTA");
314                 exit(1);
315         }
316 }
317
318 /**************************************************************************************************/
319
320 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
321         try {
322                 int numBad = 0;
323                 
324                 for (int i = 0; i < seq1.length(); i++) {
325                         //do they match
326                         if (seq1[i] != seq2[i]) { numBad++; }
327                         if (numBad > diffs) { return length;  } //to far to cluster
328                 }
329                 
330                 return numBad;
331         }
332         catch(exception& e) {
333                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
334                 exit(1);
335         }
336 }
337
338 /**************************************************************************************************/
339
340 void PreClusterCommand::printData(string newfasta, string newname){
341         try {
342                 ofstream outFasta;
343                 ofstream outNames;
344                 
345                 m->openOutputFile(newfasta, outFasta);
346                 m->openOutputFile(newname, outNames);
347                                 
348                 
349                 for (int i = 0; i < alignSeqs.size(); i++) {
350                         if (alignSeqs[i].numIdentical != 0) {
351                                 alignSeqs[i].seq.printSequence(outFasta); 
352                                 outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
353                         }
354                 }
355                 
356                 outFasta.close();
357                 outNames.close();
358                 
359         }
360         catch(exception& e) {
361                 m->errorOut(e, "PreClusterCommand", "printData");
362                 exit(1);
363         }
364 }
365 /**************************************************************************************************/
366
367 void PreClusterCommand::readNameFile(){
368         try {
369                 ifstream in;
370                 m->openInputFile(namefile, in);
371                 string firstCol, secondCol;
372                                 
373                 while (!in.eof()) {
374                         in >> firstCol >> secondCol; m->gobble(in);
375                         names[firstCol] = secondCol;
376                         int size = 1;
377
378                         for(int i=0;i<secondCol.size();i++){
379                                 if(secondCol[i] == ','){        size++; }
380                         }
381                         sizes[firstCol] = size;
382                 }
383                 in.close();
384         }
385         catch(exception& e) {
386                 m->errorOut(e, "PreClusterCommand", "readNameFile");
387                 exit(1);
388         }
389 }
390
391 /**************************************************************************************************/
392
393