]> git.donarmstrong.com Git - mothur.git/blob - clusterfragmentscommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / clusterfragmentscommand.cpp
1 /*
2  *  ryanscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clusterfragmentscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 //sort by unaligned
15 inline bool comparePriority(seqRNode first, seqRNode second) {  
16         bool better = false;
17         
18         if (first.length > second.length) { 
19                 better = true;
20         }else if (first.length == second.length) {
21                 if (first.numIdentical > second.numIdentical) {
22                         better = true;
23                 }
24         }
25         
26         return better; 
27 }
28 //**********************************************************************************************************************
29 vector<string> ClusterFragmentsCommand::setParameters(){        
30         try {
31                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
32                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
33         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
34                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
35                 CommandParameter ppercent("percent", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppercent);
36                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
37                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
38                 
39                 vector<string> myArray;
40                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
41                 return myArray;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ClusterFragmentsCommand", "setParameters");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49 string ClusterFragmentsCommand::getHelpString(){        
50         try {
51                 string helpString = "";
52                 helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n";
53                 helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n";
54                 helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n";
55                 helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
56                 helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n";
57                 helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n";
58                 helpString += "You may use diffs and percent at the same time to say something like: If the number or differences is greater than 1 or more than 2% of the fragment length, don't merge. \n";
59                 helpString += "The cluster.fragments command should be in the following format: \n";
60                 helpString += "cluster.fragments(fasta=yourFastaFile, names=yourNamesFile) \n";
61                 helpString += "Example cluster.fragments(fasta=amazon.fasta).\n";
62                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
63                 return helpString;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClusterFragmentsCommand", "getHelpString");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 string ClusterFragmentsCommand::getOutputFileNameTag(string type, string inputName=""){ 
72         try {
73         string outputFileName = "";
74                 map<string, vector<string> >::iterator it;
75         
76         //is this a type this command creates
77         it = outputTypes.find(type);
78         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
79         else {
80             if (type == "fasta") {  outputFileName =  "fragclust.fasta"; }
81             else if (type == "name") {  outputFileName =  "fragclust.names"; }
82             else if (type == "count") {  outputFileName =  "fragclust.count_table"; }
83             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
84         }
85         return outputFileName;
86         }
87         catch(exception& e) {
88                 m->errorOut(e, "ClusterFragmentsCommand", "getOutputFileNameTag");
89                 exit(1);
90         }
91 }
92
93 //**********************************************************************************************************************
94 ClusterFragmentsCommand::ClusterFragmentsCommand(){     
95         try {
96                 abort = true; calledHelp = true; 
97                 setParameters();
98                 vector<string> tempOutNames;
99                 outputTypes["fasta"] = tempOutNames;
100                 outputTypes["name"] = tempOutNames;
101         outputTypes["count"] = tempOutNames;
102         }
103         catch(exception& e) {
104                 m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand");
105                 exit(1);
106         }
107 }
108 //**********************************************************************************************************************
109 ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
110         try {
111                 abort = false; calledHelp = false;   
112                 
113                 //allow user to run help
114                 if(option == "help") { help(); abort = true; calledHelp = true; }
115                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
116                 
117                 else {
118                         vector<string> myArray = setParameters();
119                         
120                         OptionParser parser(option);
121                         map<string, string> parameters = parser.getParameters();
122                         
123                         ValidParameters validParameter;
124                         map<string, string>::iterator it;
125                 
126                         //check to make sure all parameters are valid for command
127                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
128                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
129                         }
130                         
131                         //initialize outputTypes
132                         vector<string> tempOutNames;
133                         outputTypes["fasta"] = tempOutNames;
134                         outputTypes["name"] = tempOutNames;
135             outputTypes["count"] = tempOutNames;
136                         
137                         //if the user changes the input directory command factory will send this info to us in the output parameter 
138                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
139                         if (inputDir == "not found"){   inputDir = "";          }
140                         else {
141                                 string path;
142                                 it = parameters.find("fasta");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
148                                 }
149                                 
150                                 it = parameters.find("name");
151                                 //user has given a template file
152                                 if(it != parameters.end()){ 
153                                         path = m->hasPath(it->second);
154                                         //if the user has not given a path then, add inputdir. else leave path alone.
155                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
156                                 }
157                 
158                 it = parameters.find("count");
159                                 //user has given a template file
160                                 if(it != parameters.end()){ 
161                                         path = m->hasPath(it->second);
162                                         //if the user has not given a path then, add inputdir. else leave path alone.
163                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
164                                 }
165                         }
166
167                         //check for required parameters
168                         fastafile = validParameter.validFile(parameters, "fasta", true);
169                         if (fastafile == "not found") {                                 
170                                 fastafile = m->getFastaFile(); 
171                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
172                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
173                         }
174                         else if (fastafile == "not open") { fastafile = ""; abort = true; }     
175                         else { m->setFastaFile(fastafile); }
176                         
177                         //if the user changes the output directory command factory will send this info to us in the output parameter 
178                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(fastafile);      }
179
180                         //check for optional parameter and set defaults
181                         // ...at some point should added some additional type checking...
182                         namefile = validParameter.validFile(parameters, "name", true);
183                         if (namefile == "not found") { namefile =  "";  }
184                         else if (namefile == "not open") { namefile = ""; abort = true; }       
185                         else {  readNameFile(); m->setNameFile(namefile); }
186             
187             countfile = validParameter.validFile(parameters, "count", true);
188                         if (countfile == "not open") { abort = true; countfile = ""; }  
189                         else if (countfile == "not found") { countfile = ""; }
190                         else { ct.readTable(countfile); m->setCountTableFile(countfile); }
191                         
192             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
193                         
194                         string temp;
195                         temp = validParameter.validFile(parameters, "diffs", false);            if (temp == "not found"){       temp = "0";                             }
196                         m->mothurConvert(temp, diffs); 
197                         
198                         temp = validParameter.validFile(parameters, "percent", false);          if (temp == "not found"){       temp = "0";                             }
199                         m->mothurConvert(temp, percent);
200                         
201                         if (countfile == "") {
202                 if (namefile == "") {
203                     vector<string> files; files.push_back(fastafile);
204                     parser.getNameFile(files);
205                 }
206             }
207                         
208                 }
209                                 
210         }
211         catch(exception& e) {
212                 m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand");
213                 exit(1);
214         }
215 }
216 //**********************************************************************************************************************
217 int ClusterFragmentsCommand::execute(){
218         try {
219                 
220                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
221                 
222                 int start = time(NULL);
223                 
224                 //reads fasta file and return number of seqs
225                 int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
226                 
227                 if (m->control_pressed) { return 0; }
228         
229                 if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
230                 
231                 //sort seqs by length of unaligned sequence
232                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
233         
234                 int count = 0;
235
236                 //think about running through twice...
237                 for (int i = 0; i < numSeqs; i++) {
238                         
239                         if (alignSeqs[i].active) {  //this sequence has not been merged yet
240                                 
241                                 string iBases = alignSeqs[i].seq.getUnaligned();
242                                 
243                                 //try to merge it with all smaller seqs
244                                 for (int j = i+1; j < numSeqs; j++) {
245                                         
246                                         if (m->control_pressed) { return 0; }
247                                         
248                                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
249                                                 
250                                                 string jBases = alignSeqs[j].seq.getUnaligned();
251                                                                                                         
252                                                 if (isFragment(iBases, jBases)) {
253                             if (countfile != "") {
254                                 ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names);
255                             }else {
256                                 //merge
257                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
258                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
259                             }
260                                                         alignSeqs[j].active = 0;
261                                                         alignSeqs[j].numIdentical = 0;
262                                                         count++;
263                                                 }
264                                         }//end if j active
265                                 }//end if i != j
266                         
267                                 //remove from active list 
268                                 alignSeqs[i].active = 0;
269                                 
270                         }//end if active i
271                         if(i % 100 == 0)        { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
272                 }
273                 
274                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }
275         
276                 
277                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
278                 
279                 string newFastaFile = fileroot + getOutputFileNameTag("fasta");
280                 string newNamesFile = fileroot + getOutputFileNameTag("name");
281         if (countfile != "") { newNamesFile = fileroot + getOutputFileNameTag("count"); }
282                 
283                 if (m->control_pressed) { return 0; }
284                 
285                 m->mothurOutEndLine();
286                 m->mothurOut("Total number of sequences before cluster.fragments was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
287                 m->mothurOut("cluster.fragments removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
288                 
289                 printData(newFastaFile, newNamesFile);
290                 
291                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
292                 
293                 if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
294                 
295                 m->mothurOutEndLine();
296                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
297                 m->mothurOut(newFastaFile); m->mothurOutEndLine();      
298                 m->mothurOut(newNamesFile); m->mothurOutEndLine();      
299                 outputNames.push_back(newFastaFile);  outputNames.push_back(newNamesFile); outputTypes["fasta"].push_back(newFastaFile); outputTypes["name"].push_back(newNamesFile);
300                 m->mothurOutEndLine();
301                 
302                 //set fasta file as new current fastafile
303                 string current = "";
304                 itTypes = outputTypes.find("fasta");
305                 if (itTypes != outputTypes.end()) {
306                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
307                 }
308                 
309                 itTypes = outputTypes.find("name");
310                 if (itTypes != outputTypes.end()) {
311                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
312                 }
313         
314         itTypes = outputTypes.find("count");
315                 if (itTypes != outputTypes.end()) {
316                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
317                 }
318
319                 return 0;
320                 
321         }
322         catch(exception& e) {
323                 m->errorOut(e, "ClusterFragmentsCommand", "execute");
324                 exit(1);
325         }
326 }
327 //***************************************************************************************************************
328 bool ClusterFragmentsCommand::isFragment(string seq1, string seq2){
329         try {
330                 bool fragment = false;
331                 
332                 //exact match
333                 int pos = seq1.find(seq2);
334                 if (pos != string::npos) { return true; }
335                 //no match, no diffs wanted
336                 else if ((diffs == 0) && (percent == 0)) { return false; }
337                 else { //try aligning and see if you can find it
338                         
339                         //find number of acceptable differences for this sequence fragment
340                         int totalDiffs;
341                         if (diffs == 0) { //you didnt set diffs you want a percentage
342                                 totalDiffs = floor((seq2.length() * (percent / 100.0)));
343                         }else if (percent == 0) { //you didn't set percent you want diffs
344                                 totalDiffs = diffs;
345                         }else if ((percent != 0) && (diffs != 0)) { //you want both, set total diffs to smaller of 2
346                                 totalDiffs = diffs;
347                                 int percentDiff = floor((seq2.length() * (percent / 100.0)));
348                                 if (percentDiff < totalDiffs) { totalDiffs = percentDiff; }
349                         }
350                         
351                         Alignment* alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (seq1.length()+totalDiffs+1));
352                                                         
353                         //use needleman to align 
354                         alignment->align(seq2, seq1);
355                         string tempSeq2 = alignment->getSeqAAln();
356                         string temp = alignment->getSeqBAln();
357                         
358                         delete alignment;
359                         
360                         //chop gap ends
361                         int startPos = 0;
362                         int endPos = tempSeq2.length()-1;
363                         for (int i = 0; i < tempSeq2.length(); i++) {  if (isalpha(tempSeq2[i])) { startPos = i; break; } }
364                         for (int i = tempSeq2.length()-1; i >= 0; i--) {  if (isalpha(tempSeq2[i])) { endPos = i; break; } }
365                         
366                         //count number of diffs
367                         int numDiffs = 0;
368                         for (int i = startPos; i <= endPos; i++) {
369                                 if (tempSeq2[i] != temp[i]) { numDiffs++; }
370                         }
371                         
372                         if (numDiffs <= totalDiffs) { fragment = true; }
373                         
374                 }
375                 
376                 return fragment;
377                 
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "ClusterFragmentsCommand", "isFragment");
381                 exit(1);
382         }
383 }
384 /**************************************************************************************************/
385 int ClusterFragmentsCommand::readFASTA(){
386         try {
387         
388                 ifstream inFasta;
389                 m->openInputFile(fastafile, inFasta);
390                 
391                 while (!inFasta.eof()) {
392                         
393                         if (m->control_pressed) { inFasta.close(); return 0; }
394                         
395                         Sequence seq(inFasta);  m->gobble(inFasta);
396                         
397                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
398                                 if (namefile != "") {
399                                         itSize = sizes.find(seq.getName());
400                                         
401                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
402                                         else{
403                                                 seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length());
404                                                 alignSeqs.push_back(tempNode);
405                                         }
406                 }else if(countfile != "") {
407                     seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length());
408                     alignSeqs.push_back(tempNode);
409                                 }else { //no names file, you are identical to yourself 
410                                         seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length());
411                                         alignSeqs.push_back(tempNode);
412                                 }
413                         }
414                 }
415                 
416                 inFasta.close();
417                 return alignSeqs.size();
418         }
419         
420         catch(exception& e) {
421                 m->errorOut(e, "ClusterFragmentsCommand", "readFASTA");
422                 exit(1);
423         }
424 }
425 /**************************************************************************************************/
426 void ClusterFragmentsCommand::printData(string newfasta, string newname){
427         try {
428                 ofstream outFasta;
429                 ofstream outNames;
430                 
431                 m->openOutputFile(newfasta, outFasta);
432                 if (countfile == "") {  m->openOutputFile(newname, outNames); }
433                 
434                 for (int i = 0; i < alignSeqs.size(); i++) {
435                         if (alignSeqs[i].numIdentical != 0) {
436                                 alignSeqs[i].seq.printSequence(outFasta); 
437                                 if (countfile == "") {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
438                         }
439                 }
440                 
441                 outFasta.close();
442                 if (countfile == "") {  outNames.close(); }
443         else { ct.printTable(newname); }
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "ClusterFragmentsCommand", "printData");
447                 exit(1);
448         }
449 }
450 /**************************************************************************************************/
451
452 void ClusterFragmentsCommand::readNameFile(){
453         try {
454                 ifstream in;
455                 m->openInputFile(namefile, in);
456                 string firstCol, secondCol;
457                                 
458                 while (!in.eof()) {
459                         in >> firstCol >> secondCol; m->gobble(in);
460                         names[firstCol] = secondCol;
461                         int size = 1;
462
463                         for(int i=0;i<secondCol.size();i++){
464                                 if(secondCol[i] == ','){        size++; }
465                         }
466                         sizes[firstCol] = size;
467                 }
468                 in.close();
469         }
470         catch(exception& e) {
471                 m->errorOut(e, "ClusterFragmentsCommand", "readNameFile");
472                 exit(1);
473         }
474 }
475 /**************************************************************************************************/
476