From: westcott Date: Tue, 24 May 2011 19:39:22 +0000 (+0000) Subject: paralellized seq.error and dist.shared added some error checks to libshuff and dist... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f06fdb807822f8e06db003ed809c87250905cfc8 paralellized seq.error and dist.shared added some error checks to libshuff and dist.seqs --- diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index df97fd8..6f7657f 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -8,6 +8,7 @@ */ #include "deconvolutecommand.h" +#include "sequence.hpp" //********************************************************************************************************************** vector DeconvoluteCommand::setParameters(){ @@ -144,15 +145,95 @@ int DeconvoluteCommand::execute() { string outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "names"; string outFastaFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique" + m->getExtension(inFastaName); - FastaMap fastamap; - - if(oldNameMapFName == "") { fastamap.readFastaFile(inFastaName); } - else { fastamap.readFastaFile(inFastaName, oldNameMapFName); } + map nameMap; + map::iterator itNames; + if (oldNameMapFName != "") { m->readNames(oldNameMapFName, nameMap); } if (m->control_pressed) { return 0; } - fastamap.printCondensedFasta(outFastaFile); - fastamap.printNamesFile(outNameFile); + ifstream in; + m->openInputFile(inFastaName, in); + + ofstream outFasta; + m->openOutputFile(outFastaFile, outFasta); + + map sequenceStrings; //sequenceString -> list of names. "atgc...." -> seq1,seq2,seq3. + map::iterator itStrings; + set nameInFastaFile; //for sanity checking + set::iterator itname; + int count = 0; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); outFasta.close(); remove(outFastaFile.c_str()); return 0; } + + Sequence seq(in); + + if (seq.getName() != "") { + + //sanity checks + itname = nameInFastaFile.find(seq.getName()); + if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName()); } + else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); } + + itStrings = sequenceStrings.find(seq.getAligned()); + + if (itStrings == sequenceStrings.end()) { //this is a new unique sequence + //output to unique fasta file + seq.printSequence(outFasta); + + if (oldNameMapFName != "") { + itNames = nameMap.find(seq.getName()); + + if (itNames == nameMap.end()) { //namefile and fastafile do not match + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); + }else { + sequenceStrings[seq.getAligned()] = itNames->second; + } + }else { sequenceStrings[seq.getAligned()] = seq.getName(); } + }else { //this is a dup + if (oldNameMapFName != "") { + itNames = nameMap.find(seq.getName()); + + if (itNames == nameMap.end()) { //namefile and fastafile do not match + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); + }else { + sequenceStrings[seq.getAligned()] += "," + itNames->second; + } + }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } + } + + count++; + } + + m->gobble(in); + + if(count % 1000 == 0) { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine(); } + } + + if(count % 1000 != 0) { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine(); } + + in.close(); + outFasta.close(); + + if (m->control_pressed) { remove(outFastaFile.c_str()); return 0; } + + //print new names file + ofstream outNames; + m->openOutputFile(outNameFile, outNames); + + for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) { + if (m->control_pressed) { outputTypes.clear(); remove(outFastaFile.c_str()); outNames.close(); remove(outNameFile.c_str()); return 0; } + + //get rep name + int pos = (itStrings->second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << itStrings->second << '\t' << itStrings->second << endl; + }else { + outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; + } + } + outNames.close(); if (m->control_pressed) { outputTypes.clear(); remove(outFastaFile.c_str()); remove(outNameFile.c_str()); return 0; } diff --git a/distancecommand.cpp b/distancecommand.cpp index 34206a9..b73cd7c 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -247,6 +247,8 @@ int DistanceCommand::execute(){ int numSeqs = alignDB.getNumSeqs(); cutoff += 0.005; + if (!alignDB.sameLength()) { m->mothurOut("[ERROR]: your sequences are not the same length, aborting."); m->mothurOutEndLine(); return 0; } + string outputFile; if (output == "lt") { //does the user want lower triangle phylip formatted file diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index c6671d1..fc17310 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -210,7 +210,7 @@ GetLineageCommand::GetLineageCommand(string option) { if (taxons[0] == '\'') { taxons = taxons.substr(1); } if (taxons[(taxons.length()-1)] == '\'') { taxons = taxons.substr(0, (taxons.length()-1)); } } - + m->splitAtChar(taxons, listOfTaxons, '-'); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } } @@ -550,15 +550,18 @@ int GetLineageCommand::readTax(){ string name, tax; //bool wroteSomething = false; - - bool taxonsHasConfidence = false; - vector< map > searchTaxons; - string noConfidenceTaxons = taxons; - int hasConPos = taxons.find_first_of('('); - if (hasConPos != string::npos) { - taxonsHasConfidence = true; - searchTaxons = getTaxons(taxons); - noConfidenceTaxons = removeConfidences(taxons); + vector taxonsHasConfidence; taxonsHasConfidence.resize(listOfTaxons.size(), false); + vector< vector< map > > searchTaxons; searchTaxons.resize(listOfTaxons.size()); + vector noConfidenceTaxons; noConfidenceTaxons.resize(listOfTaxons.size(), ""); + + for (int i = 0; i < listOfTaxons.size(); i++) { + noConfidenceTaxons[i] = listOfTaxons[i]; + int hasConPos = listOfTaxons[i].find_first_of('('); + if (hasConPos != string::npos) { + taxonsHasConfidence[i] = true; + searchTaxons[i] = getTaxons(listOfTaxons[i]); + noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]); + } } @@ -569,91 +572,96 @@ int GetLineageCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column - string newtax = tax; - + for (int j = 0; j < listOfTaxons.size(); j++) { + + string newtax = tax; - //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them - if (!taxonsHasConfidence) { - int hasConfidences = tax.find_first_of('('); - if (hasConfidences != string::npos) { - newtax = removeConfidences(tax); - } - - int pos = newtax.find(taxons); + //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them + if (!taxonsHasConfidence[j]) { + int hasConfidences = tax.find_first_of('('); + if (hasConfidences != string::npos) { + newtax = removeConfidences(tax); + } - if (pos != string::npos) { //this sequence contains the taxon the user wants - names.insert(name); - out << name << '\t' << tax << endl; - } + int pos = newtax.find(noConfidenceTaxons[j]); - }else{//if taxons has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); - if (hasConfidences == string::npos) { - - int pos = newtax.find(noConfidenceTaxons); - if (pos != string::npos) { //this sequence contains the taxon the user wants - names.insert(name); + names.insert(name); out << name << '\t' << tax << endl; + //since you belong to at least one of the taxons we want you are included so no need to search for other + break; } - }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons - //first remove confidences from both and see if the taxonomy exists - - string noNewTax = tax; + }else{//if listOfTaxons[i] has them and you don't them remove taxons int hasConfidences = tax.find_first_of('('); - if (hasConfidences != string::npos) { - noNewTax = removeConfidences(tax); - } + if (hasConfidences == string::npos) { - int pos = noNewTax.find(noConfidenceTaxons); + int pos = newtax.find(noConfidenceTaxons[j]); - if (pos != string::npos) { //if yes, then are the confidences okay + if (pos != string::npos) { //this sequence contains the taxon the user wants + names.insert(name); + out << name << '\t' << tax << endl; + //since you belong to at least one of the taxons we want you are included so no need to search for other + break; + } + }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons + //first remove confidences from both and see if the taxonomy exists + + string noNewTax = tax; + int hasConfidences = tax.find_first_of('('); + if (hasConfidences != string::npos) { + noNewTax = removeConfidences(tax); + } + + int pos = noNewTax.find(noConfidenceTaxons[j]); + + if (pos != string::npos) { //if yes, then are the confidences okay - bool good = true; - vector< map > usersTaxon = getTaxons(newtax); + bool good = true; + vector< map > usersTaxon = getTaxons(newtax); - //the usersTaxon is most likely longer than the searchTaxons, and searchTaxon[0] may relate to userTaxon[4] - //we want to "line them up", so we will find the the index where the searchstring starts - int index = 0; - for (int i = 0; i < usersTaxon.size(); i++) { + //the usersTaxon is most likely longer than the searchTaxons, and searchTaxon[0] may relate to userTaxon[4] + //we want to "line them up", so we will find the the index where the searchstring starts + int index = 0; + for (int i = 0; i < usersTaxon.size(); i++) { - if (usersTaxon[i].begin()->first == searchTaxons[0].begin()->first) { - index = i; - int spot = 0; - bool goodspot = true; - //is this really the start, or are we dealing with a taxon of the same name? - while ((spot < searchTaxons.size()) && ((i+spot) < usersTaxon.size())) { - if (usersTaxon[i+spot].begin()->first != searchTaxons[spot].begin()->first) { goodspot = false; break; } - else { spot++; } - } + if (usersTaxon[i].begin()->first == searchTaxons[j][0].begin()->first) { + index = i; + int spot = 0; + bool goodspot = true; + //is this really the start, or are we dealing with a taxon of the same name? + while ((spot < searchTaxons[j].size()) && ((i+spot) < usersTaxon.size())) { + if (usersTaxon[i+spot].begin()->first != searchTaxons[j][spot].begin()->first) { goodspot = false; break; } + else { spot++; } + } - if (goodspot) { break; } + if (goodspot) { break; } + } } - } - for (int i = 0; i < searchTaxons.size(); i++) { + for (int i = 0; i < searchTaxons[j].size(); i++) { - if ((i+index) < usersTaxon.size()) { //just in case, should never be false - if (usersTaxon[i+index].begin()->second < searchTaxons[i].begin()->second) { //is the users cutoff less than the search taxons + if ((i+index) < usersTaxon.size()) { //just in case, should never be false + if (usersTaxon[i+index].begin()->second < searchTaxons[j][i].begin()->second) { //is the users cutoff less than the search taxons + good = false; + break; + } + }else { good = false; break; } - }else { - good = false; - break; } - } - //passed the test so add you - if (good) { - names.insert(name); - out << name << '\t' << tax << endl; + //passed the test so add you + if (good) { + names.insert(name); + out << name << '\t' << tax << endl; + break; + } } } } - } - + } m->gobble(in); } diff --git a/getlineagecommand.h b/getlineagecommand.h index d62c50f..ca84bf2 100644 --- a/getlineagecommand.h +++ b/getlineagecommand.h @@ -32,7 +32,7 @@ class GetLineageCommand : public Command { private: set names; - vector outputNames; + vector outputNames, listOfTaxons; string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; bool abort, dups; diff --git a/getseqscommand.cpp b/getseqscommand.cpp index ad9698b..a4697ca 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -25,7 +25,8 @@ vector GetSeqsCommand::setParameters(){ CommandParameter pdups("dups", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pdups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - + CommandParameter paccnos2("accnos2", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(paccnos2); + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -232,6 +233,11 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (qualfile == "not open") { abort = true; } else if (qualfile == "not found") { qualfile = ""; } + accnosfile2 = validParameter.validFile(parameters, "accnos2", true); + if (accnosfile2 == "not open") { abort = true; } + else if (accnosfile2 == "not found") { accnosfile2 = ""; } + + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "true"; usedDups = ""; } dups = m->isTrue(temp); @@ -824,7 +830,15 @@ int GetSeqsCommand::compareAccnos(){ in >> name; if (namesAccnos.count(name) == 0){ //name unique to accnos2 - namesAccnos2.insert(name); + int pos = name.find_last_of('_'); + string tempName = name; + if (pos != string::npos) { tempName = tempName.substr(pos+1); cout << tempName << endl; } + if (namesAccnos.count(tempName) == 0){ + namesAccnos2.insert(name); + }else { //you are in both so erase + namesAccnos.erase(name); + namesDups.insert(name); + } }else { //you are in both so erase namesAccnos.erase(name); namesDups.insert(name); diff --git a/libshuffcommand.cpp b/libshuffcommand.cpp index 0210414..1143011 100644 --- a/libshuffcommand.cpp +++ b/libshuffcommand.cpp @@ -243,7 +243,11 @@ int LibShuffCommand::execute(){ setGroups(); //set the groups to be analyzed and sorts them - + + if (numGroups < 2) { m->mothurOut("[ERROR]: libshuff requires at least 2 groups, you only have " + toString(numGroups) + ", aborting."); m->mothurOutEndLine(); m->control_pressed = true; } + + if (m->control_pressed) { delete groupMap; delete matrix; return 0; } + /********************************************************************************************/ //this is needed because when we read the matrix we sort it into groups in alphabetical order //the rest of the command and the classes used in this command assume specific order diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index 9c7a836..adb145a 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -327,7 +327,6 @@ int MatrixOutputCommand::execute(){ for (int i = 0; i < processors; i++) { lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups); lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups); - cout << i << '\t' << lines[i].start << '\t' << lines[i].end << endl; } if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } m->Groups.clear(); return 0; } diff --git a/matrixoutputcommand.h b/matrixoutputcommand.h index 2c403aa..b42cfb9 100644 --- a/matrixoutputcommand.h +++ b/matrixoutputcommand.h @@ -38,21 +38,30 @@ public: void help() { m->mothurOut(getHelpString()); } private: - void printSims(ostream&); + struct linePair { + int start; + int end; + }; + vector lines; + + void printSims(ostream&, vector< vector >&); int process(vector); vector matrixCalculators; - vector< vector > simMatrix; + //vector< vector > simMatrix; InputData* input; vector lookup; string exportFileName, output, sharedfile; - int numGroups; + int numGroups, processors; ofstream out; bool abort, allLines; set labels; //holds labels to be used string outputFile, calc, groups, label, outputDir; vector Estimators, Groups, outputNames; //holds estimators to be used + int process(vector, string, string); + int driver(vector, int, int, vector< vector >&); + }; diff --git a/mothurout.cpp b/mothurout.cpp index 5a0ea12..a56a621 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -247,6 +247,53 @@ void MothurOut::mothurOutEndLine() { } } /*********************************************************************************************/ +void MothurOut::mothurOut(string output, ofstream& outputFile) { + try { + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { //only one process should output to screen +#endif + + cout << output; + out << output; + outputFile << output; + +#ifdef USE_MPI + } +#endif + } + catch(exception& e) { + errorOut(e, "MothurOut", "MothurOut"); + exit(1); + } +} +/*********************************************************************************************/ +void MothurOut::mothurOutEndLine(ofstream& outputFile) { + try { +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { //only one process should output to screen +#endif + + cout << endl; + out << endl; + outputFile << endl; + +#ifdef USE_MPI + } +#endif + } + catch(exception& e) { + errorOut(e, "MothurOut", "MothurOutEndLine"); + exit(1); + } +} +/*********************************************************************************************/ void MothurOut::mothurOutJustToLog(string output) { try { #ifdef USE_MPI diff --git a/mothurout.h b/mothurout.h index 7b71fcf..fb5b432 100644 --- a/mothurout.h +++ b/mothurout.h @@ -21,8 +21,10 @@ class MothurOut { static MothurOut* getInstance(); void setFileName(string); - void mothurOut(string); - void mothurOutEndLine(); + void mothurOut(string); //writes to cout and the logfile + void mothurOutEndLine(); //writes to cout and the logfile + void mothurOut(string, ofstream&); //writes to the ofstream, cout and the logfile + void mothurOutEndLine(ofstream&); //writes to the ofstream, cout and the logfile void mothurOutJustToLog(string); void errorOut(exception&, string, string); void closeLog(); diff --git a/refchimeratest.cpp b/refchimeratest.cpp index 28e6728..2f397e8 100644 --- a/refchimeratest.cpp +++ b/refchimeratest.cpp @@ -14,11 +14,10 @@ int MAXINT = numeric_limits::max(); //*************************************************************************************************************** -RefChimeraTest::RefChimeraTest(vector& refs, string chimeraReportFileName){ +RefChimeraTest::RefChimeraTest(vector& refs){ m = MothurOut::getInstance(); - m->openOutputFile(chimeraReportFileName, chimeraReportFile); numRefSeqs = refs.size(); referenceSeqs.resize(numRefSeqs); @@ -30,14 +29,22 @@ RefChimeraTest::RefChimeraTest(vector& refs, string chimeraReportFileN alignLength = referenceSeqs[0].length(); - chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl; -// chimeraReportFile << "leftParentTri,middleParentTri,rightParentTri\tbreakPointTriA,breakPointTriB\tminMismatchToTrimera\tdistToBestMera\tnMera" << endl; } +//*************************************************************************************************************** +int RefChimeraTest::printHeader(ofstream& chimeraReportFile){ + try { + chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl; + return 0; + }catch(exception& e) { + m->errorOut(e, "RefChimeraTest", "execute"); + exit(1); + } +} //*************************************************************************************************************** -int RefChimeraTest::analyzeQuery(string queryName, string querySeq){ +int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){ vector > left(numRefSeqs); vector singleLeft, bestLeft; diff --git a/refchimeratest.h b/refchimeratest.h index 6b648fb..d4983e0 100644 --- a/refchimeratest.h +++ b/refchimeratest.h @@ -16,8 +16,9 @@ class RefChimeraTest { public: - RefChimeraTest(vector&, string); - int analyzeQuery(string, string); + RefChimeraTest(vector&); + int printHeader(ofstream&); + int analyzeQuery(string, string, ofstream&); int getClosestRefIndex(); private: int getMismatches(string&, vector >&, vector >&, int&); @@ -32,7 +33,7 @@ private: int numRefSeqs; int alignLength; int bestMatch; - ofstream chimeraReportFile; + //ofstream chimeraReportFile; MothurOut* m; }; diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index 22453a4..5f5435f 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -211,7 +211,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { if (taxons[0] == '\'') { taxons = taxons.substr(1); } if (taxons[(taxons.length()-1)] == '\'') { taxons = taxons.substr(0, (taxons.length()-1)); } } - + m->splitAtChar(taxons, listOfTaxons, '-'); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } @@ -541,14 +541,18 @@ int RemoveLineageCommand::readTax(){ bool wroteSomething = false; - bool taxonsHasConfidence = false; - vector< map > searchTaxons; - string noConfidenceTaxons = taxons; - int hasConPos = taxons.find_first_of('('); - if (hasConPos != string::npos) { - taxonsHasConfidence = true; - searchTaxons = getTaxons(taxons); - noConfidenceTaxons = removeConfidences(taxons); + vector taxonsHasConfidence; taxonsHasConfidence.resize(listOfTaxons.size(), false); + vector< vector< map > > searchTaxons; searchTaxons.resize(listOfTaxons.size()); + vector noConfidenceTaxons; noConfidenceTaxons.resize(listOfTaxons.size(), ""); + + for (int i = 0; i < listOfTaxons.size(); i++) { + noConfidenceTaxons[i] = listOfTaxons[i]; + int hasConPos = listOfTaxons[i].find_first_of('('); + if (hasConPos != string::npos) { + taxonsHasConfidence[i] = true; + searchTaxons[i] = getTaxons(listOfTaxons[i]); + noConfidenceTaxons[i] = removeConfidences(listOfTaxons[i]); + } } @@ -559,102 +563,107 @@ int RemoveLineageCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column - string newtax = tax; + bool remove = false; - //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them - if (!taxonsHasConfidence) { - - int hasConfidences = tax.find_first_of('('); - if (hasConfidences != string::npos) { - newtax = removeConfidences(tax); - } - - int pos = newtax.find(taxons); + for (int j = 0; j < listOfTaxons.size(); j++) { + string newtax = tax; - if (pos == string::npos) { - wroteSomething = true; - out << name << '\t' << tax << endl; - }else{ //this sequence contains the taxon the user wants to remove - names.insert(name); - } - - }else{//if taxons has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); - if (hasConfidences == string::npos) { - - int pos = newtax.find(noConfidenceTaxons); + //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them + if (!taxonsHasConfidence[j]) { - if (pos == string::npos) { - wroteSomething = true; - out << name << '\t' << tax << endl; - }else{ //this sequence contains the taxon the user wants to remove - names.insert(name); - } - }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons - //first remove confidences from both and see if the taxonomy exists - - string noNewTax = tax; int hasConfidences = tax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = removeConfidences(tax); + newtax = removeConfidences(tax); } - int pos = noNewTax.find(noConfidenceTaxons); + int pos = newtax.find(noConfidenceTaxons[j]); - if (pos != string::npos) { //if yes, then are the confidences okay + if (pos == string::npos) { + //wroteSomething = true; + //out << name << '\t' << tax << endl; + }else{ //this sequence contains the taxon the user wants to remove + names.insert(name); + remove=true; break; + } + + }else{//if taxons has them and you don't them remove taxons + int hasConfidences = tax.find_first_of('('); + if (hasConfidences == string::npos) { - bool remove = false; - vector< map > usersTaxon = getTaxons(newtax); + int pos = newtax.find(noConfidenceTaxons[j]); - //the usersTaxon is most likely longer than the searchTaxons, and searchTaxon[0] may relate to userTaxon[4] - //we want to "line them up", so we will find the the index where the searchstring starts - int index = 0; - for (int i = 0; i < usersTaxon.size(); i++) { + if (pos == string::npos) { + //wroteSomething = true; + //out << name << '\t' << tax << endl; + }else{ //this sequence contains the taxon the user wants to remove + names.insert(name); + remove=true; break; + } + }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons + //first remove confidences from both and see if the taxonomy exists + + string noNewTax = tax; + int hasConfidences = tax.find_first_of('('); + if (hasConfidences != string::npos) { + noNewTax = removeConfidences(tax); + } + + int pos = noNewTax.find(noConfidenceTaxons[j]); + + if (pos != string::npos) { //if yes, then are the confidences okay - if (usersTaxon[i].begin()->first == searchTaxons[0].begin()->first) { - index = i; - int spot = 0; - bool goodspot = true; - //is this really the start, or are we dealing with a taxon of the same name? - while ((spot < searchTaxons.size()) && ((i+spot) < usersTaxon.size())) { - if (usersTaxon[i+spot].begin()->first != searchTaxons[spot].begin()->first) { goodspot = false; break; } - else { spot++; } - } + vector< map > usersTaxon = getTaxons(newtax); + + //the usersTaxon is most likely longer than the searchTaxons, and searchTaxon[0] may relate to userTaxon[4] + //we want to "line them up", so we will find the the index where the searchstring starts + int index = 0; + for (int i = 0; i < usersTaxon.size(); i++) { - if (goodspot) { break; } + if (usersTaxon[i].begin()->first == searchTaxons[j][0].begin()->first) { + index = i; + int spot = 0; + bool goodspot = true; + //is this really the start, or are we dealing with a taxon of the same name? + while ((spot < searchTaxons[j].size()) && ((i+spot) < usersTaxon.size())) { + if (usersTaxon[i+spot].begin()->first != searchTaxons[j][spot].begin()->first) { goodspot = false; break; } + else { spot++; } + } + + if (goodspot) { break; } + } } - } - - for (int i = 0; i < searchTaxons.size(); i++) { - if ((i+index) < usersTaxon.size()) { //just in case, should never be false - if (usersTaxon[i+index].begin()->second < searchTaxons[i].begin()->second) { //is the users cutoff less than the search taxons + for (int i = 0; i < searchTaxons[j].size(); i++) { + + if ((i+index) < usersTaxon.size()) { //just in case, should never be false + if (usersTaxon[i+index].begin()->second < searchTaxons[j][i].begin()->second) { //is the users cutoff less than the search taxons + remove = true; + break; + } + }else { remove = true; break; } + } + + //passed the test so remove you + if (remove) { + names.insert(name); + remove=true; break; }else { - remove = true; - break; + //wroteSomething = true; + //out << name << '\t' << tax << endl; } - } - - //passed the test so remove you - if (remove) { - names.insert(name); }else { - wroteSomething = true; - out << name << '\t' << tax << endl; + //wroteSomething = true; + //out << name << '\t' << tax << endl; } - }else { - wroteSomething = true; - out << name << '\t' << tax << endl; } } + } - - - + if (!remove) { wroteSomething = true; out << name << '\t' << tax << endl; } m->gobble(in); } in.close(); diff --git a/removelineagecommand.h b/removelineagecommand.h index 124f6e3..310a66a 100644 --- a/removelineagecommand.h +++ b/removelineagecommand.h @@ -31,7 +31,7 @@ class RemoveLineageCommand : public Command { private: set names; - vector outputNames; + vector outputNames, listOfTaxons; string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; bool abort, dups; diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index d06a5ad..2774922 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -11,6 +11,7 @@ #include "reportfile.h" #include "qualityscores.h" #include "refchimeratest.h" +#include "filterseqscommand.h" //********************************************************************************************************************** vector SeqErrorCommand::setParameters(){ @@ -23,6 +24,7 @@ vector SeqErrorCommand::setParameters(){ CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras); CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pfilter("filter", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pfilter); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -199,8 +201,11 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } convert(temp, threshold); - temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "1"; } - convert(temp, ignoreChimeras); + temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; } + ignoreChimeras = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } + filter = m->isTrue(temp); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); @@ -220,96 +225,443 @@ SeqErrorCommand::SeqErrorCommand(string option) { int SeqErrorCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } - + + int start = time(NULL); maxLength = 2000; + totalBases = 0; + totalMatches = 0; + + //run vertical filter on query and reference files. + if (filter) { + string inputString = "fasta=" + queryFileName + "-" + referenceFileName; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: filter.seqs(" + inputString + ") to improve processing time."); m->mothurOutEndLine(); + + Command* filterCommand = new FilterSeqsCommand(inputString); + filterCommand->execute(); + + map > filenames = filterCommand->getOutputFiles(); + + delete filterCommand; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + queryFileName = filenames["fasta"][0]; + referenceFileName = filenames["fasta"][1]; + } string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary"; - m->openOutputFile(errorSummaryFileName, errorSummaryFile); outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName); - printErrorHeader(); - + string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq"; - m->openOutputFile(errorSeqFileName, errorSeqFile); outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName); - + + string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera"; + outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName); + getReferences(); //read in reference sequences - make sure there's no ambiguous bases - map weights; if(namesFileName != ""){ weights = getWeights(); } - ifstream queryFile; - m->openInputFile(queryFileName, queryFile); + vector fastaFilePos; + vector qFilePos; + vector reportFilePos; - ifstream reportFile; - ifstream qualFile; + setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos); + + if (m->control_pressed) { return 0; } + + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); + if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); } + if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } + } + if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds + + int numSeqs = 0; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); + }else{ + numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName); + } +#else + numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); +#endif + + if(qualFileName != "" && reportFileName != ""){ + printErrorQuality(qScoreErrorMap); + printQualityFR(qualForwardMap, qualReverseMap); + } + + printErrorFRFile(errorForward, errorReverse); + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - ReportFile report; - QualityScores quality; - vector > qualForwardMap; - vector > qualReverseMap; + string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count"; + ofstream errorCountFile; + m->openOutputFile(errorCountFileName, errorCountFile); + outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName); + m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n"); + m->mothurOut("Errors\tSequences\n"); + errorCountFile << "Errors\tSequences\n"; + for(int i=0;imothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n'); + errorCountFile << i << '\t' << misMatchCounts[i] << endl; + } + errorCountFile.close(); + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + printSubMatrix(); + + string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query"; + ofstream megAlignmentFile; + m->openOutputFile(megAlignmentFileName, megAlignmentFile); + outputNames.push_back(megAlignmentFileName); outputTypes["error.ref-query"].push_back(megAlignmentFileName); + + for(int i=0;imothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); + m->mothurOutEndLine(); + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) { + try { + int process = 1; + processIDS.clear(); + map >::iterator it; + int num = 0; - if(qualFileName != "" && reportFileName != ""){ - m->openInputFile(qualFileName, qualFile); - report = ReportFile(reportFile, reportFileName); + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); - qualForwardMap.resize(maxLength); - qualReverseMap.resize(maxLength); - for(int i=0;i 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + + num = driver(filename, qFileName, rFileName, summaryFileName + toString(getpid()) + ".temp", errorOutputFileName+ toString(getpid()) + ".temp", chimeraOutputFileName + toString(getpid()) + ".temp", lines[process], qLines[process], rLines[process]); + + //pass groupCounts to parent + ofstream out; + string tempFile = filename + toString(getpid()) + ".info.temp"; + m->openOutputFile(tempFile, out); + + //output totalBases and totalMatches + out << num << '\t' << totalBases << '\t' << totalMatches << endl << endl; + + //output substitutionMatrix + for(int i = 0; i < substitutionMatrix.size(); i++) { + for (int j = 0; j < substitutionMatrix[i].size(); j++) { + out << substitutionMatrix[i][j] << '\t'; + } + out << endl; + } + out << endl; + + //output qScoreErrorMap + for (it = qScoreErrorMap.begin(); it != qScoreErrorMap.end(); it++) { + vector thisScoreErrorMap = it->second; + out << it->first << '\t'; + for (int i = 0; i < thisScoreErrorMap.size(); i++) { + out << thisScoreErrorMap[i] << '\t'; + } + out << endl; + } + out << endl; + + //output qualForwardMap + for(int i = 0; i < qualForwardMap.size(); i++) { + for (int j = 0; j < qualForwardMap[i].size(); j++) { + out << qualForwardMap[i][j] << '\t'; + } + out << endl; + } + out << endl; + + //output qualReverseMap + for(int i = 0; i < qualReverseMap.size(); i++) { + for (int j = 0; j < qualReverseMap[i].size(); j++) { + out << qualReverseMap[i][j] << '\t'; + } + out << endl; + } + out << endl; + + + //output errorForward + for (it = errorForward.begin(); it != errorForward.end(); it++) { + vector thisErrorForward = it->second; + out << it->first << '\t'; + for (int i = 0; i < thisErrorForward.size(); i++) { + out << thisErrorForward[i] << '\t'; + } + out << endl; + } + out << endl; + + //output errorReverse + for (it = errorReverse.begin(); it != errorReverse.end(); it++) { + vector thisErrorReverse = it->second; + out << it->first << '\t'; + for (int i = 0; i < thisErrorReverse.size(); i++) { + out << thisErrorReverse[i] << '\t'; + } + out << endl; + } + out << endl; + + //output misMatchCounts + out << misMatchCounts.size() << endl; + for (int j = 0; j < misMatchCounts.size(); j++) { + out << misMatchCounts[j] << '\t'; + } + out << endl; + + + //output megaAlignVector + for (int j = 0; j < megaAlignVector.size(); j++) { + out << megaAlignVector[j] << endl; + } + out << endl; + + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + num = driver(filename, qFileName, rFileName, summaryFileName, errorOutputFileName, chimeraOutputFileName, lines[0], qLines[0], rLines[0]); + + //force parent to wait until all the processes are done + for (int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); + + m->appendFiles((summaryFileName + toString(processIDS[i]) + ".temp"), summaryFileName); + remove((summaryFileName + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((errorOutputFileName + toString(processIDS[i]) + ".temp"), errorOutputFileName); + remove((errorOutputFileName + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((chimeraOutputFileName + toString(processIDS[i]) + ".temp"), chimeraOutputFileName); + remove((chimeraOutputFileName + toString(processIDS[i]) + ".temp").c_str()); + + ifstream in; + string tempFile = filename + toString(processIDS[i]) + ".info.temp"; + m->openInputFile(tempFile, in); + + //input totalBases and totalMatches + int tempBases, tempMatches, tempNumSeqs; + in >> tempNumSeqs >> tempBases >> tempMatches; m->gobble(in); + totalBases += tempBases; totalMatches += tempMatches; num += tempNumSeqs; + + //input substitutionMatrix + int tempNum; + for(int i = 0; i < substitutionMatrix.size(); i++) { + for (int j = 0; j < substitutionMatrix[i].size(); j++) { + in >> tempNum; substitutionMatrix[i][j] += tempNum; + } + m->gobble(in); + } + m->gobble(in); + + //input qScoreErrorMap + char first; + for (int i = 0; i < qScoreErrorMap.size(); i++) { + in >> first; + vector thisScoreErrorMap = qScoreErrorMap[first]; + + for (int i = 0; i < thisScoreErrorMap.size(); i++) { + in >> tempNum; thisScoreErrorMap[i] += tempNum; + } + qScoreErrorMap[first] = thisScoreErrorMap; + m->gobble(in); + } + m->gobble(in); + + //input qualForwardMap + for(int i = 0; i < qualForwardMap.size(); i++) { + for (int j = 0; j < qualForwardMap[i].size(); j++) { + in >> tempNum; qualForwardMap[i][j] += tempNum; + } + m->gobble(in); + } + m->gobble(in); + + //input qualReverseMap + for(int i = 0; i < qualReverseMap.size(); i++) { + for (int j = 0; j < qualReverseMap[i].size(); j++) { + in >> tempNum; qualReverseMap[i][j] += tempNum; + } + m->gobble(in); + } + m->gobble(in); + + //input errorForward + for (int i = 0; i < errorForward.size(); i++) { + in >> first; + vector thisErrorForward = errorForward[first]; + + for (int i = 0; i < thisErrorForward.size(); i++) { + in >> tempNum; thisErrorForward[i] += tempNum; + } + errorForward[first] = thisErrorForward; + m->gobble(in); + } + m->gobble(in); + + //input errorReverse + for (int i = 0; i < errorReverse.size(); i++) { + in >> first; + vector thisErrorReverse = errorReverse[first]; + + for (int i = 0; i < thisErrorReverse.size(); i++) { + in >> tempNum; thisErrorReverse[i] += tempNum; + } + errorReverse[first] = thisErrorReverse; + m->gobble(in); + } + m->gobble(in); + + //input misMatchCounts + int misMatchSize; + in >> misMatchSize; m->gobble(in); + if (misMatchSize > misMatchCounts.size()) { misMatchCounts.resize(misMatchSize, 0); } + for (int j = 0; j < misMatchCounts.size(); j++) { + in >> tempNum; misMatchCounts[j] += tempNum; + } + m->gobble(in); + + //input megaAlignVector + string thisLine; + for (int j = 0; j < megaAlignVector.size(); j++) { + thisLine = m->getline(in); m->gobble(in); megaAlignVector[j] += thisLine + '\n'; + } + m->gobble(in); + + in.close(); remove(tempFile.c_str()); + + } + + return num; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "createProcesses"); + exit(1); + } +} +//********************************************************************************************************************** +int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) { + + try { + ReportFile report; + QualityScores quality; - vector misMatchCounts(11, 0); + misMatchCounts.resize(11, 0); int maxMismatch = 0; int numSeqs = 0; map::iterator it; - map > qScoreErrorMap; qScoreErrorMap['m'].assign(41, 0); qScoreErrorMap['s'].assign(41, 0); qScoreErrorMap['i'].assign(41, 0); qScoreErrorMap['a'].assign(41, 0); - map > errorForward; errorForward['m'].assign(maxLength,0); errorForward['s'].assign(maxLength,0); errorForward['i'].assign(maxLength,0); errorForward['d'].assign(maxLength,0); errorForward['a'].assign(maxLength,0); - map > errorReverse; errorReverse['m'].assign(maxLength,0); errorReverse['s'].assign(maxLength,0); errorReverse['i'].assign(maxLength,0); errorReverse['d'].assign(maxLength,0); errorReverse['a'].assign(maxLength,0); - - string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera"; - RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName); - outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName); + //open inputfiles and go to beginning place for this processor + ifstream queryFile; + m->openInputFile(filename, queryFile); + queryFile.seekg(line.start); + + ifstream reportFile; + ifstream qualFile; + if(qFileName != "" && rFileName != ""){ + m->openInputFile(qFileName, qualFile); + qualFile.seekg(qline.start); + + //gobble headers + if (rline.start == 0) { report = ReportFile(reportFile, rFileName); } + else{ + m->openInputFile(rFileName, reportFile); + reportFile.seekg(rline.start); + } + + qualForwardMap.resize(maxLength); + qualReverseMap.resize(maxLength); + for(int i=0;iopenOutputFile(chimeraOutputFileName, outChimeraReport); + RefChimeraTest chimeraTest(referenceSeqs); + if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + + ofstream errorSummaryFile; + m->openOutputFile(summaryFileName, errorSummaryFile); + if (line.start == 0) { printErrorHeader(errorSummaryFile); } + + ofstream errorSeqFile; + m->openOutputFile(errorOutputFileName, errorSeqFile); + + megaAlignVector.resize(numRefs, ""); - vector megaAlignVector(numRefs, ""); - int index = 0; bool ignoreSeq = 0; - while(queryFile){ - - if (m->control_pressed) { errorSummaryFile.close(); errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - + bool moreSeqs = 1; + while (moreSeqs) { + + if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; } + Sequence query(queryFile); - int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned()); + int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); int closestRefIndex = chimeraTest.getClosestRefIndex(); - + if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; } else { ignoreSeq = 0; } - + Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]); if(namesFileName != ""){ @@ -317,35 +669,35 @@ int SeqErrorCommand::execute(){ minCompare.weight = it->second; } else{ minCompare.weight = 1; } - - printErrorData(minCompare, numParentSeqs); - + + printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile); + if(!ignoreSeq){ - + for(int i=0;imothurOut(toString(index) + '\n'); } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + unsigned long int pos = queryFile.tellg(); + if ((pos == -1) || (pos >= line.end)) { break; } + #else + if (queryFile.eof()) { break; } + #endif + + if(index % 100 == 0){ m->mothurOut(toString(index) + '\n'); } } queryFile.close(); + if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } errorSummaryFile.close(); errorSeqFile.close(); - - if(qualFileName != "" && reportFileName != ""){ - printErrorQuality(qScoreErrorMap); - printQualityFR(qualForwardMap, qualReverseMap); - } - - printErrorFRFile(errorForward, errorReverse); - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - - string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count"; - ofstream errorCountFile; - m->openOutputFile(errorCountFileName, errorCountFile); - outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName); - m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n"); - m->mothurOut("Errors\tSequences\n"); - errorCountFile << "Errors\tSequences\n"; - for(int i=0;imothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n'); - errorCountFile << i << '\t' << misMatchCounts[i] << endl; - } - errorCountFile.close(); - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - - printSubMatrix(); - - string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query"; - ofstream megAlignmentFile; - m->openOutputFile(megAlignmentFileName, megAlignmentFile); - outputNames.push_back(megAlignmentFileName); outputTypes["error.ref-query"].push_back(megAlignmentFileName); - - for(int i=0;imothurOut(toString(index) + '\n'); } - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - return 0; + return index; } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "execute"); + m->errorOut(e, "SeqErrorCommand", "driver"); exit(1); } } - //*************************************************************************************************************** void SeqErrorCommand::getReferences(){ @@ -572,7 +892,7 @@ map SeqErrorCommand::getWeights(){ //*************************************************************************************************************** -void SeqErrorCommand::printErrorHeader(){ +void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){ try { errorSummaryFile << "query\treference\tweight\t"; errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t"; @@ -589,7 +909,7 @@ void SeqErrorCommand::printErrorHeader(){ //*************************************************************************************************************** -void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){ +void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs, ofstream& errorSummaryFile, ofstream& errorSeqFile){ try { errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t'; @@ -825,5 +1145,182 @@ void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector } } +/**************************************************************************************************/ +int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //set file positions for fasta file + fastaFilePos = m->divideFile(filename, processors); + + if (qfilename == "") { return processors; } + + //get name of first sequence in each chunk + map firstSeqNames; + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + ifstream in; + m->openInputFile(filename, in); + in.seekg(fastaFilePos[i]); + + Sequence temp(in); + firstSeqNames[temp.getName()] = i; + + in.close(); + } + + //make copy to use below + map firstSeqNamesReport = firstSeqNames; + + //seach for filePos of each first name in the qfile and save in qfileFilePos + ifstream inQual; + m->openInputFile(qfilename, inQual); + + string input; + while(!inQual.eof()){ + input = m->getline(inQual); + + if (input.length() != 0) { + if(input[0] == '>'){ //this is a sequence name line + istringstream nameStream(input); + + string sname = ""; nameStream >> sname; + sname = sname.substr(1); + + map::iterator it = firstSeqNames.find(sname); + + if(it != firstSeqNames.end()) { //this is the start of a new chunk + unsigned long int pos = inQual.tellg(); + qfileFilePos.push_back(pos - input.length() - 1); + firstSeqNames.erase(it); + } + } + } + + if (firstSeqNames.size() == 0) { break; } + } + inQual.close(); + + if (firstSeqNames.size() != 0) { + for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your quality file, aborting."); m->mothurOutEndLine(); + } + m->control_pressed = true; + return processors; + } + + //get last file position of qfile + FILE * pFile; + unsigned long int size; + + //get num bytes in file + pFile = fopen (qfilename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + qfileFilePos.push_back(size); + + //seach for filePos of each first name in the rfile and save in rfileFilePos + string junk; + ifstream inR; + m->openInputFile(rfilename, inR); + + //read column headers + for (int i = 0; i < 16; i++) { + if (!inR.eof()) { inR >> junk; } + else { break; } + } + + while(!inR.eof()){ + + if (m->control_pressed) { inR.close(); return processors; } + + input = m->getline(inR); + + if (input.length() != 0) { + + istringstream nameStream(input); + string sname = ""; nameStream >> sname; + + map::iterator it = firstSeqNamesReport.find(sname); + + if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk + unsigned long int pos = inR.tellg(); + rfileFilePos.push_back(pos - input.length() - 1); + firstSeqNamesReport.erase(it); + } + } + + if (firstSeqNamesReport.size() == 0) { break; } + m->gobble(inR); + } + inR.close(); + + if (firstSeqNamesReport.size() != 0) { + for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); + } + m->control_pressed = true; + return processors; + } + + //get last file position of qfile + FILE * rFile; + unsigned long int sizeR; + + //get num bytes in file + rFile = fopen (rfilename.c_str(),"rb"); + if (rFile==NULL) perror ("Error opening file"); + else{ + fseek (rFile, 0, SEEK_END); + sizeR=ftell (rFile); + fclose (rFile); + } + + rfileFilePos.push_back(sizeR); + + return processors; + +#else + + fastaFilePos.push_back(0); qfileFilePos.push_back(0); + //get last file position of fastafile + FILE * pFile; + unsigned long int size; + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + fastaFilePos.push_back(size); + + //get last file position of fastafile + FILE * qFile; + + //get num bytes in file + qFile = fopen (qfilename.c_str(),"rb"); + if (qFile==NULL) perror ("Error opening file"); + else{ + fseek (qFile, 0, SEEK_END); + size=ftell (qFile); + fclose (qFile); + } + qfileFilePos.push_back(size); + + return 1; + +#endif + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "setLines"); + exit(1); + } +} //*************************************************************************************************************** diff --git a/seqerrorcommand.h b/seqerrorcommand.h index 7e87ec6..cb715ce 100644 --- a/seqerrorcommand.h +++ b/seqerrorcommand.h @@ -56,27 +56,51 @@ public: private: bool abort; + + struct linePair { + unsigned long int start; + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + }; + + vector processIDS; //processid + vector lines; + vector qLines; + vector rLines; void getReferences(); map getWeights(); Compare getErrors(Sequence, Sequence); - void printErrorHeader(); - void printErrorData(Compare, int); + void printErrorHeader(ofstream&); + void printErrorData(Compare, int, ofstream&, ofstream&); void printSubMatrix(); void printErrorFRFile(map >, map >); void printErrorQuality(map >); void printQualityFR(vector >, vector >); + + int setLines(string, string, string, vector&, vector&, vector&); + int driver(string, string, string, string, string, string, linePair, linePair, linePair); + int createProcesses(string, string, string, string, string, string); string queryFileName, referenceFileName, qualFileName, reportFileName, namesFileName, outputDir; double threshold; - bool ignoreChimeras; + bool ignoreChimeras, filter; int numRefs, processors; - int maxLength; - ofstream errorSummaryFile, errorSeqFile; + int maxLength, totalBases, totalMatches; + //ofstream errorSummaryFile, errorSeqFile; vector outputNames; vector referenceSeqs; vector > substitutionMatrix; + vector > qualForwardMap; + vector > qualReverseMap; + vector misMatchCounts; + map > qScoreErrorMap; + map > errorForward; + map > errorReverse; + map weights; + vector megaAlignVector; + }; #endif diff --git a/sequencedb.cpp b/sequencedb.cpp index 585b3b3..e798ee1 100644 --- a/sequencedb.cpp +++ b/sequencedb.cpp @@ -15,7 +15,7 @@ /***********************************************************************/ -SequenceDB::SequenceDB() { m = MothurOut::getInstance(); } +SequenceDB::SequenceDB() { m = MothurOut::getInstance(); length = 0; samelength = true; } /***********************************************************************/ //the clear function free's the memory SequenceDB::~SequenceDB() { clear(); } @@ -24,19 +24,25 @@ SequenceDB::~SequenceDB() { clear(); } SequenceDB::SequenceDB(int newSize) { data.resize(newSize, Sequence()); + length = 0; samelength = true; } /***********************************************************************/ SequenceDB::SequenceDB(ifstream& filehandle) { try{ + length = 0; samelength = true; //read through file while (!filehandle.eof()) { //input sequence info into sequencedb Sequence newSequence(filehandle); - if (newSequence.getName() != "") { data.push_back(newSequence); } + if (newSequence.getName() != "") { + if (length == 0) { length = newSequence.getAligned().length(); } + if (length != newSequence.getAligned().length()) { samelength = false; } + data.push_back(newSequence); + } //takes care of white space m->gobble(filehandle); @@ -92,7 +98,10 @@ string SequenceDB::readSequence(ifstream& in) { break; }else { sequence += line; } } - + + if (length == 0) { length = sequence.length(); } + if (length != sequence.length()) { samelength = false; } + return sequence; } catch(exception& e) { @@ -111,6 +120,9 @@ int SequenceDB::getNumSeqs() { void SequenceDB::set(int index, string newUnaligned) { try { + if (length == 0) { length = newUnaligned.length(); } + if (length != newUnaligned.length()) { samelength = false; } + data[index] = Sequence(data[index].getName(), newUnaligned); } catch(exception& e) { @@ -123,6 +135,9 @@ void SequenceDB::set(int index, string newUnaligned) { void SequenceDB::set(int index, Sequence newSeq) { try { + if (length == 0) { length = newSeq.getAligned().length(); } + if (length != newSeq.getAligned().length()) { samelength = false; } + data[index] = newSeq; } catch(exception& e) { @@ -185,6 +200,9 @@ void SequenceDB::print(ostream& out) { void SequenceDB::push_back(Sequence newSequence) { try { + if (length == 0) { length = newSequence.getAligned().length(); } + if (length != newSequence.getAligned().length()) { samelength = false; } + data.push_back(newSequence); } catch(exception& e) { diff --git a/sequencedb.h b/sequencedb.h index 3672549..8f7640e 100644 --- a/sequencedb.h +++ b/sequencedb.h @@ -37,12 +37,15 @@ public: void clear(); //clears data - remeber to loop through and delete the sequences inside or you will have a memory leak int size(); //returns datas size void print(ostream&); //loops through data using sequence class print + bool sameLength() { return samelength; } private: vector data; string readName(ifstream&); string readSequence(ifstream&); MothurOut* m; + bool samelength; + int length; }; diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index 30761e1..4363eaa 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -580,7 +580,7 @@ int SummarySharedCommand::process(vector thisLookup, string ifstream intemp; m->openInputFile(tempdistFileName, intemp); - for (int i = 0; i < calcDists.size(); i++) { + for (int k = 0; k < calcDists.size(); k++) { int size = 0; intemp >> size; m->gobble(intemp); @@ -592,7 +592,7 @@ int SummarySharedCommand::process(vector thisLookup, string intemp >> seq1 >> seq2 >> dist; m->gobble(intemp); seqDist tempDist(seq1, seq2, dist); - calcDists[i].push_back(tempDist); + calcDists[k].push_back(tempDist); } } intemp.close(); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 64522a5..dc87fff 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -240,7 +240,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "name", true); if (temp == "not found") { nameFile = ""; } - else if(temp == "not open") { abort = true; } + else if(temp == "not open") { nameFile = ""; abort = true; } else { nameFile = temp; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } @@ -818,10 +818,14 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName ofstream temp; m->openOutputFile(trimFASTAFileName, temp); temp.close(); m->openOutputFile(scrapFASTAFileName, temp); temp.close(); - m->openOutputFile(trimQualFileName, temp); temp.close(); - m->openOutputFile(scrapQualFileName, temp); temp.close(); - m->openOutputFile(trimNameFileName, temp); temp.close(); - m->openOutputFile(scrapNameFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);