]> git.donarmstrong.com Git - mothur.git/blob - deconvolutecommand.cpp
have make contigs use \r instead of \n
[mothur.git] / deconvolutecommand.cpp
1 /*
2  *  deconvolute.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 1/21/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "deconvolutecommand.h"
11 #include "sequence.hpp"
12 #include <unordered_map>
13 #include <unordered_set>
14 #include <streambuf>
15
16
17 //**********************************************************************************************************************
18 vector<string> DeconvoluteCommand::setParameters(){     
19         try {
20                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
21                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
22         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
23         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "DeconvoluteCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string DeconvoluteCommand::getHelpString(){     
37         try {
38                 string helpString = "";
39                 helpString += "The unique.seqs command reads a fastafile and creates a name or count file.\n";
40                 helpString += "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. \n";
41                 helpString += "If the sequence is unique the second column will just contain its name. \n";
42                 helpString += "The unique.seqs command parameters are fasta and name.  fasta is required, unless there is a valid current fasta file.\n";
43                 helpString += "The unique.seqs command should be in the following format: \n";
44                 helpString += "unique.seqs(fasta=yourFastaFile) \n";    
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "DeconvoluteCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 string DeconvoluteCommand::getOutputPattern(string type) {
54     try {
55         string pattern = "";
56         
57         if (type == "fasta") {  pattern = "[filename],unique,[extension]"; } 
58         else if (type == "name") {  pattern = "[filename],names-[filename],[tag],names"; } 
59         else if (type == "count") {  pattern = "[filename],count_table-[filename],[tag],count_table"; }
60         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
61         
62         return pattern;
63     }
64     catch(exception& e) {
65         m->errorOut(e, "DeconvoluteCommand", "getOutputPattern");
66         exit(1);
67     }
68 }
69
70 //**********************************************************************************************************************
71 DeconvoluteCommand::DeconvoluteCommand(){       
72         try {
73                 abort = true; calledHelp = true; 
74                 setParameters();
75                 vector<string> tempOutNames;
76                 outputTypes["fasta"] = tempOutNames;
77                 outputTypes["name"] = tempOutNames;
78         outputTypes["count"] = tempOutNames;
79         }
80         catch(exception& e) {
81                 m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand");
82                 exit(1);
83         }
84 }
85 /**************************************************************************************/
86 DeconvoluteCommand::DeconvoluteCommand(string option)  {        
87         try {
88                 abort = false; calledHelp = false;   
89                 
90                 //allow user to run help
91                 if(option == "help") { help(); abort = true; calledHelp = true; }
92                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
93                 
94                 else {
95                         vector<string> myArray = setParameters();
96                         
97                         OptionParser parser(option);
98                         map<string,string> parameters = parser.getParameters();
99                         
100                         ValidParameters validParameter;
101                         map<string, string>::iterator it;
102                 
103                         //check to make sure all parameters are valid for command
104                         for (it = parameters.begin(); it != parameters.end(); it++) { 
105                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
106                         }
107                         
108                         //initialize outputTypes
109                         vector<string> tempOutNames;
110                         outputTypes["fasta"] = tempOutNames;
111                         outputTypes["name"] = tempOutNames;
112             outputTypes["count"] = tempOutNames;
113                 
114                         //if the user changes the input directory command factory will send this info to us in the output parameter 
115                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
116                         if (inputDir == "not found"){   inputDir = "";          }
117                         else {
118                                 string path;
119                                 it = parameters.find("fasta");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
125                                 }
126                                 
127                                 it = parameters.find("name");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
133                                 }
134                 
135                 it = parameters.find("count");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
141                                 }
142                         }
143
144                         
145                         //check for required parameters
146                         inFastaName = validParameter.validFile(parameters, "fasta", true);
147                         if (inFastaName == "not open") { abort = true; }
148                         else if (inFastaName == "not found") {                          
149                                 inFastaName = m->getFastaFile(); 
150                                 if (inFastaName != "") { m->mothurOut("Using " + inFastaName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
151                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
152                         }else { m->setFastaFile(inFastaName); } 
153                         
154                         //if the user changes the output directory command factory will send this info to us in the output parameter 
155                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
156                                 outputDir = ""; 
157                                 outputDir += m->hasPath(inFastaName); //if user entered a file with a path then preserve it     
158                         }
159                         
160                         oldNameMapFName = validParameter.validFile(parameters, "name", true);
161                         if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; }
162                         else if (oldNameMapFName == "not found"){       oldNameMapFName = "";   }
163                         else { m->setNameFile(oldNameMapFName); }
164             
165             countfile = validParameter.validFile(parameters, "count", true);
166                         if (countfile == "not open") { abort = true; countfile = ""; }  
167                         else if (countfile == "not found") { countfile = ""; }
168                         else { m->setCountTableFile(countfile); }
169                         
170             if ((countfile != "") && (oldNameMapFName != "")) { m->mothurOut("When executing a unique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
171                         
172
173                         if (countfile == "") {
174                 if (oldNameMapFName == "") {
175                     vector<string> files; files.push_back(inFastaName);
176                     parser.getNameFile(files);
177                 }
178             }
179                         
180                 }
181
182         }
183         catch(exception& e) {
184                 m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand");
185                 exit(1);
186         }
187 }
188 /**************************************************************************************/
189 int DeconvoluteCommand::execute() {     
190         try {
191                 
192                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
193
194                 //prepare filenames and open files
195         map<string, string> variables; 
196         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName));
197         string outNameFile = getOutputFileName("name", variables);
198         string outCountFile = getOutputFileName("count", variables);
199         variables["[extension]"] = m->getExtension(inFastaName);
200                 string outFastaFile = getOutputFileName("fasta", variables);
201                 
202                 map<string, string> nameMap;
203                 map<string, string>::iterator itNames;
204                 if (oldNameMapFName != "")  {  
205             m->readNames(oldNameMapFName, nameMap);
206             if (oldNameMapFName == outNameFile){
207                 //prepare filenames and open files
208                 map<string, string> mvariables;
209                 mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName));
210                 mvariables["[tag]"] = "unique";
211                 outNameFile = getOutputFileName("name", mvariables);
212             }
213         }
214         CountTable ct;
215         if (countfile != "")  {  
216             ct.readTable(countfile, true, false);
217             if (countfile == outCountFile){
218                 //prepare filenames and open files
219                 map<string, string> mvariables;
220                 mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName));
221                 mvariables["[tag]"] = "unique";
222                 outCountFile = getOutputFileName("count", mvariables);   }
223         }
224                 
225                 if (m->control_pressed) { return 0; }
226                 
227                 ifstream in; 
228                 m->openInputFile(inFastaName, in);
229                 
230                 ofstream outFasta;
231                 m->openOutputFile(outFastaFile, outFasta);
232                 
233         hash<string> hash_fn;
234                 multimap<size_t,pair<streampos,string>> sequenceStrings; //sequenceString -> list of names.  "atgc...." -> seq1,seq2,seq3.
235                 unordered_set<string> nameInFastaFile; //for sanity checking
236         unordered_set<string>::iterator itname;
237                 int count = 0;
238                 while (!in.eof()) {
239                         
240                         if (m->control_pressed) { in.close(); outFasta.close(); m->mothurRemove(outFastaFile); return 0; }
241                         streampos seq_start = reliable_tellg(in);
242                         Sequence seq(in);
243                         
244                         if (seq.getName() != "") {
245                                 
246                                 //sanity checks
247                                 itname = nameInFastaFile.find(seq.getName());
248                                 if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName());  }
249                                 else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); }
250
251                 auto itStrings = in_fasta_from_hash_map(sequenceStrings,seq,in);
252                 if (itStrings == sequenceStrings.end()) { //this is a new unique sequence
253                                         //output to unique fasta file
254                                         seq.printSequence(outFasta);
255                                         
256                                         if (oldNameMapFName != "") {
257                                                 itNames = nameMap.find(seq.getName());
258                                                 
259                                                 if (itNames == nameMap.end()) { //namefile and fastafile do not match
260                                                         m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
261                             m->mothurOutEndLine();
262                                                 }else {
263                           sequenceStrings.emplace(hash_fn(seq.getAligned()),
264                                                   make_pair(seq_start,seq.getName()));
265                                                 }
266                                         }else {
267                       if (countfile != "") { 
268                         ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
269                       }
270                       sequenceStrings.emplace(hash_fn(seq.getAligned()),
271                                               make_pair(seq_start,
272                                                         seq.getName())
273                                               );
274                     }
275                                 }else { //this is a dup
276                                         if (oldNameMapFName != "" && 
277                         nameMap.find(seq.getName()) == nameMap.end()) { //namefile and fastafile do not match
278                       m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
279                       m->mothurOutEndLine();
280                     } else if (countfile != "") { 
281                       int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
282                       if (num != 0) { //its in the table
283                         ct.mergeCounts(primary_seqname(itStrings->second.second), seq.getName()); //merges counts and saves in uniques name
284                       }
285                     }
286                     itStrings->second.second += "," + seq.getName();
287                                 }
288                                 
289                                 count++;
290                         }
291                         
292                         m->gobble(in);
293                         
294                         if(count % 1000 == 0)   { m->mothurOutJustToScreen(toString(count) + "\t" + toString(sequenceStrings.size()) + "\n");   }
295                 }
296                 
297                 if(count % 1000 != 0)   { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine();       }
298                 
299                 in.close();
300                 outFasta.close();
301                 
302                 if (m->control_pressed) { m->mothurRemove(outFastaFile); return 0; }
303                 
304                 //print new names file
305                 ofstream outNames;
306                 if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile);  }
307         else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); }
308                 
309                 for (auto it = sequenceStrings.begin(); it != sequenceStrings.end(); it++) {
310                         if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
311                         
312             if (countfile == "") {
313               //get rep name
314               outNames << primary_seqname(it->second.second) << '\t' << it->second.second << endl;
315             } else {
316               //get rep name
317               ct.printSeq(outNames,primary_seqname(it->second.second));
318             }
319         }
320                 outNames.close();
321                 
322                 if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); }  return 0; }
323                 
324                 m->mothurOutEndLine();
325                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
326                 outputNames.push_back(outFastaFile);   outputTypes["fasta"].push_back(outFastaFile);  
327         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
328                 m->mothurOutEndLine();
329
330                 //set fasta file as new current fastafile
331                 string current = "";
332                 itTypes = outputTypes.find("fasta");
333                 if (itTypes != outputTypes.end()) {
334                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
335                 }
336                 
337                 itTypes = outputTypes.find("name");
338                 if (itTypes != outputTypes.end()) {
339                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
340                 }
341         
342         itTypes = outputTypes.find("count");
343                 if (itTypes != outputTypes.end()) {
344                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
345                 }
346                 
347                 return 0;
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "DeconvoluteCommand", "execute");
351                 exit(1);
352         }
353 }
354 /**************************************************************************************/
355
356 multimap<size_t,std::pair<std::streampos,string>>::iterator
357   in_fasta_from_hash_map(multimap<size_t,std::pair<std::streampos,string>>& seq_strings,
358                             Sequence& seq,
359                             ifstream& fasta_file) {
360   const std::hash<std::string> hash_fn;
361   auto it_range = seq_strings.equal_range(hash_fn(seq.getAligned()));
362   // If the hash of the aligned sequence is not in the map, it can't
363   // possibly be a duplicate
364   if (it_range.first == seq_strings.end()) {
365     return seq_strings.end();
366   }
367   pair<bool,streampos> file_pos = store_filepos(fasta_file);
368   // If it is, it might be a hash collision. Read the file to
369   // determine if that's the case
370   for (auto it = it_range.first; it != it_range.second; it++) {
371     streampos pos = it->second.first;
372     fasta_file.seekg(pos);
373     // read in sequence
374     Sequence old_seq(fasta_file);
375     if (old_seq.getAligned() == seq.getAligned()) {
376       restore_filepos(fasta_file,file_pos);
377       return it;
378     }
379   }
380   restore_filepos(fasta_file,file_pos);
381   return seq_strings.end();
382 }
383
384 streampos reliable_tellg(ifstream& file) {
385   bool was_eof = file.eof();
386   if (was_eof) { // we're at EOF: clear the bit so tellg works properly
387     file.clear(file.rdstate() & ~ios::eofbit);
388   }
389   streampos pos = file.tellg();
390   if (was_eof) { // we're at EOF: reset eofbit
391     file.clear(file.rdstate() | ios::eofbit);
392   }
393   return pos;
394 }
395
396
397 pair<bool,streampos> store_filepos(ifstream& file) {
398   bool was_eof = file.eof();
399   if (was_eof) { // we're at EOF: clear the bit so tellg works properly
400     file.clear(file.rdstate() & ~ios::eofbit);
401   }
402   streampos cur_pos = file.tellg();
403   return make_pair(was_eof,cur_pos);
404 }
405
406 void restore_filepos(ifstream& file, pair<bool,streampos>& pos_info) {
407   file.seekg(pos_info.second);
408   if (pos_info.first) { // at eof?
409     file.clear(file.rdstate() | ios::eofbit); // restore the eof bit if it was set
410   }
411 }
412
413 string primary_seqname(string name) {
414   int pos = name.find_first_of(',');
415   if (pos == string::npos) {
416     return name;
417   }
418   return name.substr(0,pos);
419 }