2 * classifyseqscommand.cpp
5 * Created by westcott on 11/2/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
13 #include "phylotree.h"
14 #include "phylosummary.h"
18 //**********************************************************************************************************************
19 vector<string> ClassifySeqsCommand::getValidParameters(){
21 string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
22 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
26 m->errorOut(e, "ClassifySeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
31 ClassifySeqsCommand::ClassifySeqsCommand(){
34 //initialize outputTypes
35 vector<string> tempOutNames;
36 outputTypes["taxonomy"] = tempOutNames;
37 outputTypes["taxsummary"] = tempOutNames;
38 outputTypes["matchdist"] = tempOutNames;
41 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
45 //**********************************************************************************************************************
46 vector<string> ClassifySeqsCommand::getRequiredParameters(){
48 string Array[] = {"fasta","template","taxonomy"};
49 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53 m->errorOut(e, "ClassifySeqsCommand", "getRequiredParameters");
57 //**********************************************************************************************************************
58 vector<string> ClassifySeqsCommand::getRequiredFiles(){
60 vector<string> myArray;
64 m->errorOut(e, "ClassifySeqsCommand", "getRequiredFiles");
68 //**********************************************************************************************************************
69 ClassifySeqsCommand::ClassifySeqsCommand(string option) {
73 //allow user to run help
74 if(option == "help") { help(); abort = true; }
78 //valid paramters for this command
79 string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
80 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
82 OptionParser parser(option);
83 map<string, string> parameters = parser.getParameters();
85 ValidParameters validParameter("classify.seqs");
86 map<string, string>::iterator it;
88 //check to make sure all parameters are valid for command
89 for (it = parameters.begin(); it != parameters.end(); it++) {
90 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
93 //initialize outputTypes
94 vector<string> tempOutNames;
95 outputTypes["taxonomy"] = tempOutNames;
96 outputTypes["taxsummary"] = tempOutNames;
97 outputTypes["matchdist"] = tempOutNames;
99 //if the user changes the output directory command factory will send this info to us in the output parameter
100 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
102 //if the user changes the input directory command factory will send this info to us in the output parameter
103 string inputDir = validParameter.validFile(parameters, "inputdir", false);
104 if (inputDir == "not found"){ inputDir = ""; }
107 it = parameters.find("template");
108 //user has given a template file
109 if(it != parameters.end()){
110 path = m->hasPath(it->second);
111 //if the user has not given a path then, add inputdir. else leave path alone.
112 if (path == "") { parameters["template"] = inputDir + it->second; }
115 it = parameters.find("taxonomy");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
123 it = parameters.find("group");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["group"] = inputDir + it->second; }
132 //check for required parameters
133 templateFileName = validParameter.validFile(parameters, "template", true);
134 if (templateFileName == "not found") {
135 m->mothurOut("template is a required parameter for the classify.seqs command.");
136 m->mothurOutEndLine();
139 else if (templateFileName == "not open") { abort = true; }
142 fastaFileName = validParameter.validFile(parameters, "fasta", false);
143 if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; }
145 m->splitAtDash(fastaFileName, fastaFileNames);
147 //go through files and make sure they are good, if not, then disregard them
148 for (int i = 0; i < fastaFileNames.size(); i++) {
149 if (inputDir != "") {
150 string path = m->hasPath(fastaFileNames[i]);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
158 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
160 //if you can't open it, try default location
161 if (ableToOpen == 1) {
162 if (m->getDefaultPath() != "") { //default path is set
163 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
164 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
166 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
168 fastaFileNames[i] = tryPath;
172 if (ableToOpen == 1) {
173 if (m->getOutputDir() != "") { //default path is set
174 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
175 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
177 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
179 fastaFileNames[i] = tryPath;
185 if (ableToOpen == 1) {
186 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
187 //erase from file list
188 fastaFileNames.erase(fastaFileNames.begin()+i);
194 //make sure there is at least one valid file left
195 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
199 taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
200 if (taxonomyFileName == "not found") {
201 m->mothurOut("taxonomy is a required parameter for the classify.seqs command.");
202 m->mothurOutEndLine();
205 else if (taxonomyFileName == "not open") { abort = true; }
208 namefile = validParameter.validFile(parameters, "name", false);
209 if (namefile == "not found") { namefile = ""; }
212 m->splitAtDash(namefile, namefileNames);
214 //go through files and make sure they are good, if not, then disregard them
215 for (int i = 0; i < namefileNames.size(); i++) {
216 if (inputDir != "") {
217 string path = m->hasPath(namefileNames[i]);
218 //if the user has not given a path then, add inputdir. else leave path alone.
219 if (path == "") { namefileNames[i] = inputDir + namefileNames[i]; }
224 ableToOpen = m->openInputFile(namefileNames[i], in, "noerror");
226 //if you can't open it, try default location
227 if (ableToOpen == 1) {
228 if (m->getDefaultPath() != "") { //default path is set
229 string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]);
230 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
232 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
234 namefileNames[i] = tryPath;
238 if (ableToOpen == 1) {
239 if (m->getOutputDir() != "") { //default path is set
240 string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]);
241 m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
243 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
245 namefileNames[i] = tryPath;
250 if (ableToOpen == 1) {
251 m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); abort = true;
252 //erase from file list
253 namefileNames.erase(namefileNames.begin()+i);
260 if (namefile != "") {
261 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
264 groupfile = validParameter.validFile(parameters, "group", false);
265 if (groupfile == "not found") { groupfile = ""; }
267 m->splitAtDash(groupfile, groupfileNames);
269 //go through files and make sure they are good, if not, then disregard them
270 for (int i = 0; i < groupfileNames.size(); i++) {
271 if (inputDir != "") {
272 string path = m->hasPath(groupfileNames[i]);
273 //if the user has not given a path then, add inputdir. else leave path alone.
274 if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; }
279 ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror");
281 //if you can't open it, try default location
282 if (ableToOpen == 1) {
283 if (m->getDefaultPath() != "") { //default path is set
284 string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]);
285 m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
287 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
289 groupfileNames[i] = tryPath;
293 if (ableToOpen == 1) {
294 if (m->getOutputDir() != "") { //default path is set
295 string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]);
296 m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
298 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
300 groupfileNames[i] = tryPath;
306 if (ableToOpen == 1) {
307 m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = "";
308 //erase from file list
309 groupfileNames.erase(groupfileNames.begin()+i);
315 if (groupfile != "") {
316 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
318 for (int i = 0; i < fastaFileNames.size(); i++) { groupfileNames.push_back(""); }
321 //check for optional parameter and set defaults
322 // ...at some point should added some additional type checking...
324 temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; }
325 convert(temp, kmerSize);
327 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
328 convert(temp, processors);
330 search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
332 method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; }
334 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
335 convert(temp, match);
337 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
338 convert(temp, misMatch);
340 temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
341 convert(temp, gapOpen);
343 temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
344 convert(temp, gapExtend);
346 temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; }
347 convert(temp, numWanted);
349 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0"; }
350 convert(temp, cutoff);
352 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
353 probs = m->isTrue(temp);
355 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; }
356 convert(temp, iters);
360 if ((method == "bayesian") && (search != "kmer")) {
361 m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
367 catch(exception& e) {
368 m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
373 //**********************************************************************************************************************
375 ClassifySeqsCommand::~ClassifySeqsCommand(){
377 if (abort == false) {
378 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
382 //**********************************************************************************************************************
384 void ClassifySeqsCommand::help(){
386 m->mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
387 m->mothurOut("The classify.seqs command parameters are template, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
388 m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
389 m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
390 m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
391 m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n");
392 m->mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
393 m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
394 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
396 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
398 m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
399 m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
400 m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
401 m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n");
402 m->mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
403 m->mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n");
404 m->mothurOut("The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n");
405 m->mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n");
406 m->mothurOut("The classify.seqs command should be in the following format: \n");
407 m->mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
408 m->mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
409 m->mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
410 m->mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
411 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
413 catch(exception& e) {
414 m->errorOut(e, "ClassifySeqsCommand", "help");
420 //**********************************************************************************************************************
422 int ClassifySeqsCommand::execute(){
424 if (abort == true) { return 0; }
426 if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); }
427 else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
429 m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
430 m->mothurOutEndLine();
431 classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);
434 if (m->control_pressed) { delete classify; return 0; }
437 for (int s = 0; s < fastaFileNames.size(); s++) {
439 m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
441 string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName));
442 RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
443 if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
444 RippedTaxName += ".";
446 if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
447 string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
448 string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
449 string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary";
451 if ((method == "knn") && (search == "distance")) {
452 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist";
453 classify->setDistName(DistName); outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName);
456 outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
457 outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary);
459 int start = time(NULL);
460 int numFastaSeqs = 0;
461 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
464 int pid, numSeqsPerProcessor;
466 vector<unsigned long int> MPIPos;
469 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
470 MPI_Comm_size(MPI_COMM_WORLD, &processors);
473 MPI_File outMPINewTax;
474 MPI_File outMPITempTax;
476 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
477 int inMode=MPI_MODE_RDONLY;
479 char outNewTax[1024];
480 strcpy(outNewTax, newTaxonomyFile.c_str());
482 char outTempTax[1024];
483 strcpy(outTempTax, tempTaxonomyFile.c_str());
485 char inFileName[1024];
486 strcpy(inFileName, fastaFileNames[s].c_str());
488 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
489 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
490 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
492 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; }
494 if (pid == 0) { //you are the root process
496 MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
498 //send file positions to all processes
499 for(int i = 1; i < processors; i++) {
500 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
501 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
504 //figure out how many sequences you have to align
505 numSeqsPerProcessor = numFastaSeqs / processors;
506 int startIndex = pid * numSeqsPerProcessor;
507 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
511 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
513 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; }
515 for (int i = 1; i < processors; i++) {
517 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
519 }else{ //you are a child process
520 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
521 MPIPos.resize(numFastaSeqs+1);
522 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
524 //figure out how many sequences you have to align
525 numSeqsPerProcessor = numFastaSeqs / processors;
526 int startIndex = pid * numSeqsPerProcessor;
527 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
531 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos);
533 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; }
536 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
540 MPI_File_close(&inMPI);
541 MPI_File_close(&outMPINewTax);
542 MPI_File_close(&outMPITempTax);
543 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
547 vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
549 for (int i = 0; i < (positions.size()-1); i++) {
550 lines.push_back(new linePair(positions[i], positions[(i+1)]));
553 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
555 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
558 processIDS.resize(0);
560 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
564 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
568 m->mothurOutEndLine();
569 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
574 if (pid == 0) { //this part does not need to be paralellized
576 if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); MPIReadNamesFile(namefileNames[s]); m->mothurOut(" Done."); m->mothurOutEndLine(); }
581 m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush();
583 nameMap.clear(); //remove old names
586 m->openInputFile(namefileNames[s], inNames);
588 string firstCol, secondCol;
589 while(!inNames.eof()) {
590 inNames >> firstCol >> secondCol; m->gobble(inNames);
593 m->splitAtComma(secondCol, temp);
595 nameMap[firstCol] = temp;
599 m->mothurOut(" Done."); m->mothurOutEndLine();
604 if (groupfile != "") { group = groupfileNames[s]; }
606 PhyloSummary taxaSum(taxonomyFileName, group);
608 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; }
610 if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); }
613 m->openInputFile(tempTaxonomyFile, in);
615 //read in users taxonomy file and add sequences to tree
619 in >> name >> taxon; m->gobble(in);
621 itNames = nameMap.find(name);
623 if (itNames == nameMap.end()) {
624 m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
626 for (int i = 0; i < itNames->second.size(); i++) {
627 taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs
629 itNames->second.clear();
630 nameMap.erase(itNames->first);
635 remove(tempTaxonomyFile.c_str());
637 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; }
641 m->openOutputFile(taxSummary, outTaxTree);
642 taxaSum.print(outTaxTree);
645 //output taxonomy with the unclassified bins added
647 m->openInputFile(newTaxonomyFile, inTax);
650 string unclass = newTaxonomyFile + ".unclass.temp";
651 m->openOutputFile(unclass, outTax);
653 //get maxLevel from phylotree so you know how many 'unclassified's to add
654 int maxLevel = taxaSum.getMaxLevel();
656 //read taxfile - this reading and rewriting is done to preserve the confidence scores.
658 while (!inTax.eof()) {
659 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
661 inTax >> name >> taxon; m->gobble(inTax);
663 string newTax = addUnclassifieds(taxon, maxLevel);
665 outTax << name << '\t' << newTax << endl;
670 remove(newTaxonomyFile.c_str());
671 rename(unclass.c_str(), newTaxonomyFile.c_str());
673 m->mothurOutEndLine();
674 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
680 m->mothurOutEndLine();
681 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
682 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
683 m->mothurOutEndLine();
689 catch(exception& e) {
690 m->errorOut(e, "ClassifySeqsCommand", "execute");
695 /**************************************************************************************************/
696 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
698 string newTax, taxon;
701 //keep what you have counting the levels
702 while (tax.find_first_of(';') != -1) {
704 taxon = tax.substr(0,tax.find_first_of(';'))+';';
705 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
710 //add "unclassified" until you reach maxLevel
711 while (level < maxlevel) {
712 newTax += "unclassified;";
718 catch(exception& e) {
719 m->errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
724 /**************************************************************************************************/
726 int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
728 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
732 //loop through and create all the processes you want
733 while (process != processors) {
737 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
740 num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
742 //pass numSeqs to parent
744 string tempFile = filename + toString(getpid()) + ".num.temp";
745 m->openOutputFile(tempFile, out);
751 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
752 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
757 //parent does its part
758 num = driver(lines[0], taxFileName, tempTaxFile, filename);
760 //force parent to wait until all the processes are done
761 for (int i=0;i<processIDS.size();i++) {
762 int temp = processIDS[i];
766 for (int i = 0; i < processIDS.size(); i++) {
768 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
769 m->openInputFile(tempFile, in);
770 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
771 in.close(); remove(tempFile.c_str());
774 for(int i=0;i<processIDS.size();i++){
775 appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
776 appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
777 remove((taxFileName + toString(processIDS[i]) + ".temp").c_str());
778 remove((tempTaxFile + toString(processIDS[i]) + ".temp").c_str());
784 catch(exception& e) {
785 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
789 /**************************************************************************************************/
791 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
796 m->openOutputFileAppend(filename, output);
797 m->openInputFile(temp, input);
799 while(char c = input.get()){
800 if(input.eof()) { break; }
801 else { output << c; }
807 catch(exception& e) {
808 m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
813 //**********************************************************************************************************************
815 int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){
818 m->openOutputFile(taxFName, outTax);
820 ofstream outTaxSimple;
821 m->openOutputFile(tempTFName, outTaxSimple);
824 m->openInputFile(filename, inFASTA);
828 inFASTA.seekg(filePos->start);
834 if (m->control_pressed) { return 0; }
836 Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
838 if (candidateSeq->getName() != "") {
840 taxonomy = classify->getTaxonomy(candidateSeq);
842 if (m->control_pressed) { delete candidateSeq; return 0; }
844 if (taxonomy != "bad seq") {
845 //output confidence scores or not
847 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
849 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
852 outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
858 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
859 unsigned long int pos = inFASTA.tellg();
860 if ((pos == -1) || (pos >= filePos->end)) { break; }
862 if (inFASTA.eof()) { break; }
866 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
870 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
874 outTaxSimple.close();
878 catch(exception& e) {
879 m->errorOut(e, "ClassifySeqsCommand", "driver");
883 //**********************************************************************************************************************
885 int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
887 MPI_Status statusNew;
888 MPI_Status statusTemp;
892 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
897 for(int i=0;i<num;i++){
899 if (m->control_pressed) { return 0; }
902 int length = MPIPos[start+i+1] - MPIPos[start+i];
903 char* buf4 = new char[length];
904 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
906 string tempBuf = buf4;
907 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
908 istringstream iss (tempBuf,istringstream::in);
911 Sequence* candidateSeq = new Sequence(iss);
913 if (candidateSeq->getName() != "") {
914 taxonomy = classify->getTaxonomy(candidateSeq);
916 if (taxonomy != "bad seq") {
917 //output confidence scores or not
919 outputString = candidateSeq->getName() + "\t" + taxonomy + "\n";
921 outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
924 int length = outputString.length();
925 char* buf2 = new char[length];
926 memcpy(buf2, outputString.c_str(), length);
928 MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew);
931 outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n";
932 length = outputString.length();
933 char* buf = new char[length];
934 memcpy(buf, outputString.c_str(), length);
936 MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp);
942 if((i+1) % 100 == 0){ cout << "Classifying sequence " << (i+1) << endl; }
945 if(num % 100 != 0){ cout << "Classifying sequence " << (num) << endl; }
950 catch(exception& e) {
951 m->errorOut(e, "ClassifySeqsCommand", "driverMPI");
956 //**********************************************************************************************************************
957 int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){
960 nameMap.clear(); //remove old names
966 //char* inFileName = new char[nameFilename.length()];
967 //memcpy(inFileName, nameFilename.c_str(), nameFilename.length());
969 char inFileName[1024];
970 strcpy(inFileName, nameFilename.c_str());
972 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
973 MPI_File_get_size(inMPI, &size);
976 char* buffer = new char[size];
977 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
979 string tempBuf = buffer;
980 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
981 istringstream iss (tempBuf,istringstream::in);
984 string firstCol, secondCol;
986 iss >> firstCol >> secondCol; m->gobble(iss);
989 m->splitAtComma(secondCol, temp);
991 nameMap[firstCol] = temp;
994 MPI_File_close(&inMPI);
998 catch(exception& e) {
999 m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile");
1004 /**************************************************************************************************/