classifyseqscommand.cpp

   1 /*
   2  *  classifyseqscommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 11/2/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "classifyseqscommand.h"
  11 #include "sequence.hpp"
  12 #include "bayesian.h"
  13 #include "phylotree.h"
  14 #include "knn.h"
  15
  16 //**********************************************************************************************************************
  17
  18 ClassifySeqsCommand::ClassifySeqsCommand(string option){
  19         try {
  20                 abort = false;
  21
  22                 //allow user to run help
  23                 if(option == "help") { help(); abort = true; }
  24
  25                 else {
  26
  27                         //valid paramters for this command
  28                         string AlignArray[] =  {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
  29                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
  30
  31                         OptionParser parser(option);
  32                         map<string, string> parameters = parser.getParameters();
  33
  34                         ValidParameters validParameter;
  35
  36                         //check to make sure all parameters are valid for command
  37                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
  38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
  39                         }
  40
  41                         //check for required parameters
  42                         templateFileName = validParameter.validFile(parameters, "template", true);
  43                         if (templateFileName == "not found") {
  44                                 mothurOut("template is a required parameter for the classify.seqs command.");
  45                                 mothurOutEndLine();
  46                                 abort = true;
  47                         }
  48                         else if (templateFileName == "not open") { abort = true; }
  49
  50                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
  51                         if (fastaFileName == "not found") { mothurOut("fasta is a required parameter for the classify.seqs command."); mothurOutEndLine(); abort = true;  }
  52                         else {
  53                                 splitAtDash(fastaFileName, fastaFileNames);
  54
  55                                 //go through files and make sure they are good, if not, then disregard them
  56                                 for (int i = 0; i < fastaFileNames.size(); i++) {
  57                                         int ableToOpen;
  58                                         ifstream in;
  59                                         ableToOpen = openInputFile(fastaFileNames[i], in);
  60                                         if (ableToOpen == 1) {
  61                                                 mothurOut(fastaFileNames[i] + " will be disregarded."); mothurOutEndLine();
  62                                                 //erase from file list
  63                                                 fastaFileNames.erase(fastaFileNames.begin()+i);
  64                                                 i--;
  65                                         }
  66                                         in.close();
  67                                 }
  68
  69                                 //make sure there is at least one valid file left
  70                                 if (fastaFileNames.size() == 0) { mothurOut("no valid files."); mothurOutEndLine(); abort = true; }
  71                         }
  72
  73
  74                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
  75                         if (taxonomyFileName == "not found") {
  76                                 mothurOut("taxonomy is a required parameter for the classify.seqs command.");
  77                                 mothurOutEndLine();
  78                                 abort = true;
  79                         }
  80                         else if (taxonomyFileName == "not open") { abort = true; }
  81
  82
  83                         namefile = validParameter.validFile(parameters, "name", false);
  84                         if (namefile == "not found") { namefile = "";  }
  85                         else {
  86                                 splitAtDash(namefile, namefileNames);
  87
  88                                 //go through files and make sure they are good, if not, then disregard them
  89                                 for (int i = 0; i < namefileNames.size(); i++) {
  90                                         int ableToOpen;
  91                                         ifstream in;
  92                                         ableToOpen = openInputFile(namefileNames[i], in);
  93                                         if (ableToOpen == 1) {  mothurOut("Unable to match name file with fasta file."); mothurOutEndLine(); abort = true;      }
  94                                         in.close();
  95                                 }
  96                         }
  97
  98                         if (namefile != "") {
  99                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; mothurOut("If you provide a name file, you must have one for each fasta file."); mothurOutEndLine(); }
 100                         }
 101
 102                         //check for optional parameter and set defaults
 103                         // ...at some point should added some additional type checking...
 104                         string temp;
 105                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
 106                         convert(temp, kmerSize);
 107
 108                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 109                         convert(temp, processors);
 110
 111                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
 112
 113                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
 114
 115                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
 116                         convert(temp, match);
 117
 118                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
 119                         convert(temp, misMatch);
 120
 121                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
 122                         convert(temp, gapOpen);
 123
 124                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
 125                         convert(temp, gapExtend);
 126
 127                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
 128                         convert(temp, numWanted);
 129
 130                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
 131                         convert(temp, cutoff);
 132
 133                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
 134                         probs = isTrue(temp);
 135
 136                         temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
 137                         convert(temp, iters);
 138
 139
 140
 141                         if ((method == "bayesian") && (search != "kmer"))  {
 142                                 mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
 143                                 search = "kmer";
 144                         }
 145                 }
 146
 147         }
 148         catch(exception& e) {
 149                 errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
 150                 exit(1);
 151         }
 152 }
 153
 154 //**********************************************************************************************************************
 155
 156 ClassifySeqsCommand::~ClassifySeqsCommand(){
 157
 158         if (abort == false) {
 159                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
 160         }
 161 }
 162
 163 //**********************************************************************************************************************
 164
 165 void ClassifySeqsCommand::help(){
 166         try {
 167                 mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
 168                 mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
 169                 mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
 170                 mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
 171                 mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
 172                 mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
 173                 mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
 174                 mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
 175                 mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
 176                 mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
 177                 mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
 178                 mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
 179                 mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
 180                 mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n");
 181                 mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n");
 182                 mothurOut("The classify.seqs command should be in the following format: \n");
 183                 mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
 184                 mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
 185                 mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
 186                 mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
 187                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
 188         }
 189         catch(exception& e) {
 190                 errorOut(e, "ClassifySeqsCommand", "help");
 191                 exit(1);
 192         }
 193 }
 194
 195
 196 //**********************************************************************************************************************
 197
 198 int ClassifySeqsCommand::execute(){
 199         try {
 200                 if (abort == true) {    return 0;       }
 201
 202                 if(method == "bayesian")                        {       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
 203                 else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
 204                 else {
 205                         mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
 206                         mothurOutEndLine();
 207                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);
 208                 }
 209
 210
 211                 for (int s = 0; s < fastaFileNames.size(); s++) {
 212
 213                         //read namefile
 214                         if(namefile != "") {
 215                                 nameMap.clear(); //remove old names
 216
 217                                 ifstream inNames;
 218                                 openInputFile(namefileNames[s], inNames);
 219
 220                                 string firstCol, secondCol;
 221                                 while(!inNames.eof()) {
 222                                         inNames >> firstCol >> secondCol; gobble(inNames);
 223                                         nameMap[firstCol] = getNumNames(secondCol);  //ex. seq1 seq1,seq3,seq5 -> seq1 = 3.
 224                                 }
 225                                 inNames.close();
 226                         }
 227
 228                         mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); mothurOutEndLine();
 229                         string newTaxonomyFile = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "taxonomy";
 230                         string tempTaxonomyFile = getRootName(fastaFileNames[s]) + "taxonomy.temp";
 231                         string taxSummary = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "tax.summary";
 232
 233                         int start = time(NULL);
 234                         int numFastaSeqs = 0;
 235                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
 236
 237 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 238                         if(processors == 1){
 239                                 ifstream inFASTA;
 240                                 openInputFile(fastaFileNames[s], inFASTA);
 241                                 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
 242                                 inFASTA.close();
 243
 244                                 lines.push_back(new linePair(0, numFastaSeqs));
 245
 246                                 driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
 247                         }
 248                         else{
 249                                 vector<int> positions;
 250                                 processIDS.resize(0);
 251
 252                                 ifstream inFASTA;
 253                                 openInputFile(fastaFileNames[s], inFASTA);
 254
 255                                 string input;
 256                                 while(!inFASTA.eof()){
 257                                         input = getline(inFASTA);
 258                                         if (input.length() != 0) {
 259                                                 if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
 260                                         }
 261                                 }
 262                                 inFASTA.close();
 263
 264                                 numFastaSeqs = positions.size();
 265
 266                                 int numSeqsPerProcessor = numFastaSeqs / processors;
 267
 268                                 for (int i = 0; i < processors; i++) {
 269                                         int startPos = positions[ i * numSeqsPerProcessor ];
 270                                         if(i == processors - 1){
 271                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
 272                                         }
 273                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
 274                                 }
 275                                 createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
 276
 277                                 rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
 278                                 rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
 279
 280                                 for(int i=1;i<processors;i++){
 281                                         appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
 282                                         appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
 283                                         remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
 284                                         remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
 285                                 }
 286
 287                         }
 288 #else
 289                         ifstream inFASTA;
 290                         openInputFile(fastaFileNames[s], inFASTA);
 291                         numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
 292                         inFASTA.close();
 293
 294                         lines.push_back(new linePair(0, numFastaSeqs));
 295
 296                         driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
 297 #endif
 298                         //make taxonomy tree from new taxonomy file
 299                         PhyloTree taxaBrowser;
 300
 301                         ifstream in;
 302                         openInputFile(tempTaxonomyFile, in);
 303
 304                         //read in users taxonomy file and add sequences to tree
 305                         string name, taxon;
 306                         while(!in.eof()){
 307                                 in >> name >> taxon; gobble(in);
 308
 309                                 if (namefile != "") {
 310                                         itNames = nameMap.find(name);
 311
 312                                         if (itNames == nameMap.end()) {
 313                                                 mothurOut(name + " is not in your name file please correct."); mothurOutEndLine(); exit(1);
 314                                         }else{
 315                                                 for (int i = 0; i < itNames->second; i++) {
 316                                                         taxaBrowser.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
 317                                                 }
 318                                         }
 319                                 }else {  taxaBrowser.addSeqToTree(name, taxon);  } //add it once
 320                         }
 321                         in.close();
 322
 323                         taxaBrowser.assignHeirarchyIDs(0);
 324
 325                         taxaBrowser.binUnclassified();
 326
 327                         remove(tempTaxonomyFile.c_str());
 328
 329                         //print summary file
 330                         ofstream outTaxTree;
 331                         openOutputFile(taxSummary, outTaxTree);
 332                         taxaBrowser.print(outTaxTree);
 333                         outTaxTree.close();
 334
 335                         //output taxonomy with the unclassified bins added
 336                         ifstream inTax;
 337                         openInputFile(newTaxonomyFile, inTax);
 338
 339                         ofstream outTax;
 340                         string unclass = newTaxonomyFile + ".unclass.temp";
 341                         openOutputFile(unclass, outTax);
 342
 343                         //get maxLevel from phylotree so you know how many 'unclassified's to add
 344                         int maxLevel = taxaBrowser.getMaxLevel();
 345
 346                         //read taxfile - this reading and rewriting is done to preserve the confidence sscores.
 347                         while (!inTax.eof()) {
 348                                 inTax >> name >> taxon; gobble(inTax);
 349
 350                                 string newTax = addUnclassifieds(taxon, maxLevel);
 351
 352                                 outTax << name << '\t' << newTax << endl;
 353                         }
 354                         inTax.close();
 355                         outTax.close();
 356
 357                         remove(newTaxonomyFile.c_str());
 358                         rename(unclass.c_str(), newTaxonomyFile.c_str());
 359
 360                         mothurOutEndLine();
 361                         mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); mothurOutEndLine(); mothurOutEndLine();
 362                 }
 363
 364                 delete classify;
 365                 return 0;
 366         }
 367         catch(exception& e) {
 368                 errorOut(e, "ClassifySeqsCommand", "execute");
 369                 exit(1);
 370         }
 371 }
 372
 373 /**************************************************************************************************/
 374 string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
 375         try{
 376                 string newTax, taxon;
 377                 int level = 0;
 378
 379                 //keep what you have counting the levels
 380                 while (tax.find_first_of(';') != -1) {
 381                         //get taxon
 382                         taxon = tax.substr(0,tax.find_first_of(';'));
 383                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
 384                         newTax += taxon;
 385                         level++;
 386                 }
 387
 388                 //add "unclassified" until you reach maxLevel
 389                 while (level < maxlevel) {
 390                         newTax += "unclassified;";
 391                         level++;
 392                 }
 393
 394                 return newTax;
 395         }
 396         catch(exception& e) {
 397                 errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
 398                 exit(1);
 399         }
 400 }
 401
 402 /**************************************************************************************************/
 403
 404 void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
 405         try {
 406 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 407                 int process = 0;
 408                 //              processIDS.resize(0);
 409
 410                 //loop through and create all the processes you want
 411                 while (process != processors) {
 412                         int pid = fork();
 413
 414                         if (pid > 0) {
 415                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 416                                 process++;
 417                         }else if (pid == 0){
 418                                 driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
 419                                 exit(0);
 420                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
 421                 }
 422
 423                 //force parent to wait until all the processes are done
 424                 for (int i=0;i<processors;i++) {
 425                         int temp = processIDS[i];
 426                         wait(&temp);
 427                 }
 428 #endif
 429         }
 430         catch(exception& e) {
 431                 errorOut(e, "ClassifySeqsCommand", "createProcesses");
 432                 exit(1);
 433         }
 434 }
 435 /**************************************************************************************************/
 436
 437 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
 438         try{
 439
 440                 ofstream output;
 441                 ifstream input;
 442                 openOutputFileAppend(filename, output);
 443                 openInputFile(temp, input);
 444
 445                 while(char c = input.get()){
 446                         if(input.eof())         {       break;                  }
 447                         else                            {       output << c;    }
 448                 }
 449
 450                 input.close();
 451                 output.close();
 452         }
 453         catch(exception& e) {
 454                 errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
 455                 exit(1);
 456         }
 457 }
 458
 459 //**********************************************************************************************************************
 460
 461 int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){
 462         try {
 463                 ofstream outTax;
 464                 openOutputFile(taxFName, outTax);
 465
 466                 ofstream outTaxSimple;
 467                 openOutputFile(tempTFName, outTaxSimple);
 468
 469                 ifstream inFASTA;
 470                 openInputFile(filename, inFASTA);
 471
 472                 inFASTA.seekg(line->start);
 473
 474                 string taxonomy;
 475
 476                 for(int i=0;i<line->numSeqs;i++){
 477
 478                         Sequence* candidateSeq = new Sequence(inFASTA);
 479
 480                         if (candidateSeq->getName() != "") {
 481                                 taxonomy = classify->getTaxonomy(candidateSeq);
 482
 483                                 if (taxonomy != "bad seq") {
 484                                         //output confidence scores or not
 485                                         if (probs) {
 486                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
 487                                         }else{
 488                                                 outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
 489                                         }
 490
 491                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
 492                                 }
 493                         }
 494                         delete candidateSeq;
 495
 496                         if((i+1) % 100 == 0){
 497                                 mothurOut("Classifying sequence " + toString(i+1)); mothurOutEndLine();
 498                         }
 499                 }
 500
 501                 inFASTA.close();
 502                 outTax.close();
 503                 outTaxSimple.close();
 504
 505                 return 1;
 506         }
 507         catch(exception& e) {
 508                 errorOut(e, "ClassifySeqsCommand", "driver");
 509                 exit(1);
 510         }
 511 }
 512
 513 /**************************************************************************************************/