From a9dbc22713bfc056a797361dd757b1a5c98e1c01 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Mon, 22 Oct 2012 11:40:41 -0400 Subject: [PATCH] added sets to amova and homova commands. added oligos to make.contigs. added metadata to make.biom. fixed bug in get.oturep caused by adding the count file. changed vector declaration in cooccurrence to fixed windows issue. classify.seqs can now ignore seqs in taxonomy file not in template file. --- alignmentdb.cpp | 13 +- alignnode.cpp | 2 +- amovacommand.cpp | 38 +- amovacommand.h | 2 +- chimerauchimecommand.cpp | 22 +- classify.cpp | 43 +- classifyseqscommand.cpp | 15 +- classifyseqscommand.h | 3 +- consensusseqscommand.cpp | 6 +- cooccurrencecommand.cpp | 6 +- corraxescommand.cpp | 11 +- formatcolumn.cpp | 163 ++++++++ formatcolumn.h | 1 + formatmatrix.h | 2 + formatphylip.cpp | 249 +++++++++++- formatphylip.h | 2 + getoturepcommand.cpp | 8 +- heatmapcommand.cpp | 10 +- homovacommand.cpp | 36 +- homovacommand.h | 2 +- makebiomcommand.cpp | 101 ++++- makebiomcommand.h | 5 +- makecontigscommand.cpp | 595 ++++++++++++++++++++++----- makecontigscommand.h | 198 +++++++-- mothurout.cpp | 69 ++++ mothurout.h | 3 + otuassociationcommand.cpp | 16 +- otuassociationcommand.h | 1 + phylotypecommand.cpp | 5 +- seqsummarycommand.cpp | 2 +- sequence.cpp | 4 +- sharedrabundfloatvector.cpp | 2 - subsample.cpp | 2 +- trimoligos.cpp | 777 +++++++++++++++++++----------------- trimoligos.h | 12 +- trimseqscommand.cpp | 34 +- unifracweightedcommand.cpp | 8 +- 37 files changed, 1878 insertions(+), 590 deletions(-) diff --git a/alignmentdb.cpp b/alignmentdb.cpp index f59c5db..4ce76ea 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -42,12 +42,14 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap numSeqs = templateSequences.size(); if (!silent) { m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); } - + }else { int start = time(NULL); m->mothurOutEndLine(); m->mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); - + bool aligned = false; + int tempLength = 0; + #ifdef USE_MPI int pid, processors; vector positions; @@ -102,6 +104,9 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap //save longest base if (temp.getUnaligned().length() >= longest) { longest = temp.getUnaligned().length()+1; } + if (tempLength != 0) { + if (tempLength != temp.getAligned().length()) { m->mothurOut("[ERROR]: template is not aligned, aborting.\n"); m->control_pressed=true; } + }else { tempLength = temp.getAligned().length(); } } } @@ -125,6 +130,10 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap //save longest base if (temp.getUnaligned().length() >= longest) { longest = (temp.getUnaligned().length()+1); } + + if (tempLength != 0) { + if (tempLength != temp.getAligned().length()) { m->mothurOut("[ERROR]: template is not aligned, aborting.\n"); m->control_pressed=true; } + }else { tempLength = temp.getAligned().length(); } } } fastaFile.close(); diff --git a/alignnode.cpp b/alignnode.cpp index ccd8fb0..08ba40c 100755 --- a/alignnode.cpp +++ b/alignnode.cpp @@ -7,7 +7,7 @@ * */ -#include "alignNode.h" +#include "alignnode.h" #include "taxonomynode.h" #include "bayesian.h" diff --git a/amovacommand.cpp b/amovacommand.cpp index c4260fb..f36027f 100644 --- a/amovacommand.cpp +++ b/amovacommand.cpp @@ -10,12 +10,14 @@ #include "amovacommand.h" #include "readphylipvector.h" #include "groupmap.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector AmovaCommand::setParameters(){ try { CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign); + CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter palpha("alpha", "Number", "", "0.05", "", "", "",false,false); parameters.push_back(palpha); @@ -37,9 +39,10 @@ string AmovaCommand::getHelpString(){ string helpString = ""; helpString += "Referenced: Anderson MJ (2001). A new method for non-parametric multivariate analysis of variance. Austral Ecol 26: 32-46."; helpString += "The amova command outputs a .amova file."; - helpString += "The amova command parameters are phylip, iters, and alpha. The phylip and design parameters are required, unless you have valid current files."; + helpString += "The amova command parameters are phylip, iters, sets and alpha. The phylip and design parameters are required, unless you have valid current files."; helpString += "The design parameter allows you to assign your samples to groups when you are running amova. It is required."; helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the sample name and the second column is the group the sample belongs to."; + helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n"; helpString += "The iters parameter allows you to set number of randomization for the P value. The default is 1000."; helpString += "The amova command should be in the following format: amova(phylip=file.dist, design=file.design)."; helpString += "Note: No spaces between parameter labels (i.e. iters), '=' and parameters (i.e. 1000)."; @@ -161,6 +164,12 @@ AmovaCommand::AmovaCommand(string option) { temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found") { temp = "0.05"; } m->mothurConvert(temp, experimentwiseAlpha); + + string sets = validParameter.validFile(parameters, "sets", false); + if (sets == "not found") { sets = ""; } + else { + m->splitAtDash(sets, Sets); + } } } catch(exception& e) { @@ -184,7 +193,32 @@ int AmovaCommand::execute(){ //read in distance matrix and square it ReadPhylipVector readMatrix(phylipFileName); vector sampleNames = readMatrix.read(distanceMatrix); - + + if (Sets.size() != 0) { //user selected sets, so we want to remove the samples not in those sets + SharedUtil util; + vector dGroups = designMap->getNamesOfGroups(); + util.setGroups(Sets, dGroups); + + for(int i=0;icontrol_pressed) { delete designMap; return 0; } + + string group = designMap->getGroup(sampleNames[i]); + + if (group == "not found") { + m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; + }else if (!m->inUsersGroups(group, Sets)){ //not in set we want remove it + //remove from all other rows + for(int j=0;j > getRandomizedGroups(map >); bool abort; - vector outputNames; + vector outputNames, Sets; string outputDir, inputDir, designFileName, phylipFileName; GroupMap* designMap; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index ae01190..6f7ba10 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -1452,15 +1452,21 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc string name = ""; string chimeraFlag = ""; - in >> chimeraFlag >> name; - - //fix name if needed - if (templatefile == "self") { - name = name.substr(0, name.length()-1); //rip off last / - name = name.substr(0, name.find_last_of('/')); + //in >> chimeraFlag >> name; + + string line = m->getline(in); + vector pieces = m->splitWhiteSpace(line); + if (pieces.size() > 2) { + name = pieces[1]; + //fix name if needed + if (templatefile == "self") { + name = name.substr(0, name.length()-1); //rip off last / + name = name.substr(0, name.find_last_of('/')); + } + + chimeraFlag = pieces[pieces.size()-1]; } - - for (int i = 0; i < 15; i++) { in >> chimeraFlag; } + //for (int i = 0; i < 15; i++) { in >> chimeraFlag; } m->gobble(in); if (chimeraFlag == "Y") { out << name << endl; numChimeras++; } diff --git a/classify.cpp b/classify.cpp index 8aa3cdb..15ef0aa 100644 --- a/classify.cpp +++ b/classify.cpp @@ -23,7 +23,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); } taxFile = tfile; - readTaxonomy(taxFile); + int numSeqs = 0; if (tempFile == "saved") { @@ -74,11 +74,6 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me database->setNumSeqs(numSeqs); - //sanity check - bool okay = phyloTree->ErrorCheck(names); - - if (!okay) { m->control_pressed = true; } - m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine(); }else { @@ -221,18 +216,19 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me fastaFile.close(); } #endif - + database->setNumSeqs(names.size()); - //sanity check - bool okay = phyloTree->ErrorCheck(names); - - if (!okay) { m->control_pressed = true; } - m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); } - + + readTaxonomy(taxFile); + + //sanity check + bool okay = phyloTree->ErrorCheck(names); + + if (!okay) { m->control_pressed = true; } } catch(exception& e) { m->errorOut(e, "Classify", "generateDatabaseAndNames"); @@ -299,9 +295,13 @@ int Classify::readTaxonomy(string file) { iss >> name; m->gobble(iss); iss >> taxInfo; if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); } - taxonomy[name] = taxInfo; - phyloTree->addSeqToTree(name, taxInfo); - } + if (m->inUsersGroups(name, names)) { + taxonomy[name] = taxInfo; + phyloTree->addSeqToTree(name, taxInfo); + }else { + m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n"); + } + } MPI_File_close(&inMPI); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case @@ -309,7 +309,16 @@ int Classify::readTaxonomy(string file) { taxonomy.clear(); m->readTax(file, taxonomy); - for (map::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) { phyloTree->addSeqToTree(itTax->first, itTax->second); } + map tempTaxonomy; + for (map::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) { + if (m->inUsersGroups(itTax->first, names)) { + phyloTree->addSeqToTree(itTax->first, itTax->second); + tempTaxonomy[itTax->first] = itTax->second; + }else { + m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n"); + } + } + taxonomy = tempTaxonomy; #endif phyloTree->assignHeirarchyIDs(0); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 36bccc2..7f262bf 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -658,7 +658,6 @@ int ClassifySeqsCommand::execute(){ } outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); - outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); int start = time(NULL); @@ -782,7 +781,9 @@ int ClassifySeqsCommand::execute(){ } #endif - if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); } + if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); + outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); + }else { m->mothurRemove(newaccnosFile); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); @@ -895,12 +896,12 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI } #endif - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); //set taxonomy file as new current taxonomyfile string current = ""; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 0cffec6..87d4f51 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -166,10 +166,11 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; + string outputMethodTag = pDataArray->method + "."; if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } else if(pDataArray->method == "zap"){ - outputMethodTag = search + "_" + outputMethodTag; + outputMethodTag = pDataArray->search + "_" + outputMethodTag; if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } } diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index 94d6682..36df8c0 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -610,10 +610,10 @@ char ConsensusSeqsCommand::getBase(vector counts, int size){ //A,T,G,C,Gap //zero out counts that don't make the cutoff float percentage = (100.0 - cutoff) / 100.0; - int zeroCutoff = percentage * size; - + for (int i = 0; i < counts.size(); i++) { - if (counts[i] < zeroCutoff) { counts[i] = 0; } + float countPercentage = counts[i] / (float) size; + if (countPercentage < percentage) { counts[i] = 0; } } //any diff --git a/cooccurrencecommand.cpp b/cooccurrencecommand.cpp index 00de4d6..ef8c9ed 100644 --- a/cooccurrencecommand.cpp +++ b/cooccurrencecommand.cpp @@ -321,11 +321,11 @@ int CooccurrenceCommand::getCooccurrence(vector& thisLookUp int nrows = numOTUS;//rows of inital matrix int ncols = thisLookUp.size();//groups double initscore = 0.0; - + vector stats; - double probabilityMatrix[ncols * nrows]; + vector probabilityMatrix; probabilityMatrix.resize(ncols * nrows, 0); vector > nullmatrix(nrows, vector(ncols, 0)); - + TrialSwap2 trial; int n = accumulate( columntotal.begin(), columntotal.end(), 0 ); diff --git a/corraxescommand.cpp b/corraxescommand.cpp index f8083cc..c4454a5 100644 --- a/corraxescommand.cpp +++ b/corraxescommand.cpp @@ -925,16 +925,11 @@ int CorrAxesCommand::getMetadata(){ m->openInputFile(metadatafile, in); string headerLine = m->getline(in); m->gobble(in); - istringstream iss (headerLine,istringstream::in); - - //read the first label, because it refers to the groups - string columnLabel; - iss >> columnLabel; m->gobble(iss); + vector pieces = m->splitWhiteSpace(headerLine); //save names of columns you are reading - while (!iss.eof()) { - iss >> columnLabel; m->gobble(iss); - metadataLabels.push_back(columnLabel); + for (int i = 1; i < pieces.size(); i++) { + metadataLabels.push_back(pieces[i]); } int count = metadataLabels.size(); diff --git a/formatcolumn.cpp b/formatcolumn.cpp index 6b29f90..109c09c 100644 --- a/formatcolumn.cpp +++ b/formatcolumn.cpp @@ -179,6 +179,169 @@ int FormatColumnMatrix::read(NameAssignment* nameMap){ exit(1); } } +/***********************************************************************/ + +int FormatColumnMatrix::read(CountTable* nameMap){ + try { + + string firstName, secondName; + float distance; + int nseqs = nameMap->size(); + + list = new ListVector(nameMap->getListVector()); + + Progress* reading = new Progress("Formatting matrix: ", nseqs * nseqs); + + int lt = 1; + int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose + int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix + + //need to see if this is a square or a triangular matrix... + + ofstream out; + string tempOutFile = filename + ".temp"; + m->openOutputFile(tempOutFile, out); + + while(fileHandle && lt == 1){ //let's assume it's a triangular matrix... + + if (m->control_pressed) { out.close(); m->mothurRemove(tempOutFile); fileHandle.close(); delete reading; return 0; } + + fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance + + int itA = nameMap->get(firstName); + int itB = nameMap->get(secondName); + + if (distance == -1) { distance = 1000000; } + + if((distance < cutoff) && (itA != itB)){ + if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... + refRow = itA; + refCol = itB; + + //making it square + out << itA << '\t' << itB << '\t' << distance << endl; + out << itB << '\t' << itA << '\t' << distance << endl; + } + else if(refRow == itA && refCol == itB){ lt = 0; } //you are square + else if(refRow == itB && refCol == itA){ lt = 0; } //you are square + else{ //making it square + out << itA << '\t' << itB << '\t' << distance << endl; + out << itB << '\t' << itA << '\t' << distance << endl; + } + + reading->update(itA * nseqs / 2); + } + m->gobble(fileHandle); + } + out.close(); + fileHandle.close(); + + string squareFile; + if(lt == 0){ // oops, it was square + squareFile = filename; + }else{ squareFile = tempOutFile; } + + //sort file by first column so the distances for each row are together + string outfile = m->getRootName(squareFile) + "sorted.dist.temp"; + + //use the unix sort +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + string command = "sort -n " + squareFile + " -o " + outfile; + system(command.c_str()); +#else //sort using windows sort + string command = "sort " + squareFile + " /O " + outfile; + system(command.c_str()); +#endif + + if (m->control_pressed) { m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; } + + //output to new file distance for each row and save positions in file where new row begins + ifstream in; + m->openInputFile(outfile, in); + + distFile = outfile + ".rowFormatted"; + m->openOutputFile(distFile, out); + + rowPos.resize(nseqs, -1); + int currentRow; + int first, second; + float dist; + map rowMap; + map::iterator itRow; + + //get first currentRow + in >> first; + currentRow = first; + + string firstString = toString(first); + for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); } + + while(!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(distFile); m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; } + + in >> first >> second >> dist; m->gobble(in); + + if (first != currentRow) { + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + currentRow = first; + rowMap.clear(); + + //save row you just read + if (dist < cutoff) { + rowMap[second] = dist; + } + }else{ + if (dist < cutoff) { + rowMap[second] = dist; + } + } + } + + //print last Row + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + + in.close(); + out.close(); + + if (m->control_pressed) { m->mothurRemove(distFile); m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; } + + m->mothurRemove(tempOutFile); + m->mothurRemove(outfile); + + reading->finish(); + + delete reading; + list->setLabel("0"); + + if (m->control_pressed) { m->mothurRemove(distFile); return 0; } + + return 1; + + } + catch(exception& e) { + m->errorOut(e, "FormatColumnMatrix", "read"); + exit(1); + } +} /***********************************************************************/ FormatColumnMatrix::~FormatColumnMatrix(){} diff --git a/formatcolumn.h b/formatcolumn.h index 0fe96ed..157a289 100644 --- a/formatcolumn.h +++ b/formatcolumn.h @@ -19,6 +19,7 @@ public: FormatColumnMatrix(string); ~FormatColumnMatrix(); int read(NameAssignment*); + int read(CountTable*); private: ifstream fileHandle; diff --git a/formatmatrix.h b/formatmatrix.h index 7e7a99c..859b78a 100644 --- a/formatmatrix.h +++ b/formatmatrix.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "listvector.hpp" #include "nameassignment.hpp" +#include "counttable.h" //********************************************************************************************************************** @@ -61,6 +62,7 @@ public: virtual ~FormatMatrix() {} virtual int read(NameAssignment*){ return 1; } + virtual int read(CountTable*){ return 1; } void setCutoff(float c) { cutoff = c; } ListVector* getListVector() { return list; } diff --git a/formatphylip.cpp b/formatphylip.cpp index 6059117..57bc5d7 100644 --- a/formatphylip.cpp +++ b/formatphylip.cpp @@ -30,9 +30,14 @@ int FormatPhylipMatrix::read(NameAssignment* nameMap){ if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } else { convert(numTest, nseqs); } - - list = new ListVector(nseqs); - list->set(0, name); + if(nameMap == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ + list = new ListVector(nameMap->getListVector()); + if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); } + } char d; while((d=fileHandle.get()) != EOF){ @@ -71,7 +76,9 @@ int FormatPhylipMatrix::read(NameAssignment* nameMap){ fileHandle >> name; - list->set(i, name); + if(nameMap == NULL){ list->set(i, name); } + else { if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); } + } for(int j=0;j> name; - list->set(i, name); + if(nameMap == NULL){ list->set(i, name); } + else { if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); } + } for(int j=0;jcontrol_pressed) { fileHandle.close(); out.close(); m->mothurRemove(distFile); delete reading; return 0; } @@ -235,6 +244,236 @@ int FormatPhylipMatrix::read(NameAssignment* nameMap){ exit(1); } } +/***********************************************************************/ +//not using nameMap +int FormatPhylipMatrix::read(CountTable* nameMap){ + try { + + float distance; + int square, nseqs; + string name; + ofstream out; + + string numTest; + fileHandle >> numTest >> name; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { convert(numTest, nseqs); } + + if(nameMap == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ + list = new ListVector(nameMap->getListVector()); + nameMap->get(name); + } + + char d; + while((d=fileHandle.get()) != EOF){ + + if(isalnum(d)){ //you are square + square = 1; + fileHandle.close(); //reset file + + //open and get through numSeqs, code below formats rest of file + m->openInputFile(filename, fileHandle); + fileHandle >> nseqs; m->gobble(fileHandle); + + distFile = filename + ".rowFormatted"; + m->openOutputFile(distFile, out); + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + Progress* reading; + reading = new Progress("Formatting matrix: ", nseqs * nseqs); + + //lower triangle, so must go to column then formatted row file + if(square == 0){ + int index = 0; + + ofstream outTemp; + string tempFile = filename + ".temp"; + m->openOutputFile(tempFile, outTemp); + + //convert to square column matrix + for(int i=1;i> name; + + if(nameMap == NULL){ list->set(i, name); } + else { nameMap->get(name); } + + + for(int j=0;jcontrol_pressed) { outTemp.close(); m->mothurRemove(tempFile); fileHandle.close(); delete reading; return 0; } + + fileHandle >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + outTemp << i << '\t' << j << '\t' << distance << endl; + outTemp << j << '\t' << i << '\t' << distance << endl; + } + index++; + reading->update(index); + } + } + outTemp.close(); + + //format from square column to rowFormatted + //sort file by first column so the distances for each row are together + string outfile = m->getRootName(tempFile) + "sorted.dist.temp"; + + //use the unix sort +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + string command = "sort -n " + tempFile + " -o " + outfile; + system(command.c_str()); +#else //sort using windows sort + string command = "sort " + tempFile + " /O " + outfile; + system(command.c_str()); +#endif + + if (m->control_pressed) { m->mothurRemove(tempFile); m->mothurRemove(outfile); delete reading; return 0; } + + //output to new file distance for each row and save positions in file where new row begins + ifstream in; + m->openInputFile(outfile, in); + + distFile = outfile + ".rowFormatted"; + m->openOutputFile(distFile, out); + + rowPos.resize(nseqs, -1); + int currentRow; + int first, second; + float dist; + map rowMap; + map::iterator itRow; + + //get first currentRow + in >> first; + currentRow = first; + + string firstString = toString(first); + for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); } + + while(!in.eof()) { + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); m->mothurRemove(distFile); m->mothurRemove(outfile); delete reading; return 0; } + + in >> first >> second >> dist; m->gobble(in); + + if (first != currentRow) { + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + currentRow = first; + rowMap.clear(); + + //save row you just read + rowMap[second] = dist; + + index++; + reading->update(index); + }else{ + rowMap[second] = dist; + } + } + + //print last Row + //save position in file of each new row + rowPos[currentRow] = out.tellp(); + + out << currentRow << '\t' << rowMap.size() << '\t'; + + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + in.close(); + out.close(); + + m->mothurRemove(tempFile); + m->mothurRemove(outfile); + + if (m->control_pressed) { m->mothurRemove(distFile); delete reading; return 0; } + + } + else{ //square matrix convert directly to formatted row file + int index = nseqs; + map rowMap; + map::iterator itRow; + rowPos.resize(nseqs, -1); + + for(int i=0;i> name; + + if(nameMap == NULL){ list->set(i, name); } + else { nameMap->get(name); } + + for(int j=0;jcontrol_pressed) { fileHandle.close(); out.close(); m->mothurRemove(distFile); delete reading; return 0; } + + fileHandle >> distance; + + if (distance == -1) { distance = 1000000; } + + if((distance < cutoff) && (j != i)){ + rowMap[j] = distance; + } + index++; + reading->update(index); + } + + m->gobble(fileHandle); + + //save position in file of each new row + rowPos[i] = out.tellp(); + + //output row to file + out << i << '\t' << rowMap.size() << '\t'; + for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) { + out << itRow->first << '\t' << itRow->second << '\t'; + } + out << endl; + + //clear map for new row's info + rowMap.clear(); + } + } + reading->finish(); + delete reading; + fileHandle.close(); + out.close(); + + if (m->control_pressed) { m->mothurRemove(distFile); return 0; } + + list->setLabel("0"); + + return 1; + + + } + catch(exception& e) { + m->errorOut(e, "FormatPhylipMatrix", "read"); + exit(1); + } +} + /***********************************************************************/ FormatPhylipMatrix::~FormatPhylipMatrix(){} /***********************************************************************/ diff --git a/formatphylip.h b/formatphylip.h index b920d08..c081305 100644 --- a/formatphylip.h +++ b/formatphylip.h @@ -20,6 +20,8 @@ public: FormatPhylipMatrix(string); ~FormatPhylipMatrix(); int read(NameAssignment*); + int read(CountTable*); + private: ifstream fileHandle; string filename; diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 9f4dd54..1d26b47 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -574,6 +574,8 @@ int GetOTURepCommand::readDist() { readMatrix->read(nameMap); }else if (countfile != "") { readMatrix->read(&ct); + }else { + readMatrix->read(nameMap); } if (m->control_pressed) { delete readMatrix; return 0; } @@ -614,9 +616,11 @@ int GetOTURepCommand::readDist() { if(namefile != ""){ nameMap = new NameAssignment(namefile); nameMap->readMap(); - readMatrix->read(nameMap); + formatMatrix->read(nameMap); }else if (countfile != "") { - readMatrix->read(&ct); + formatMatrix->read(&ct); + }else { + formatMatrix->read(nameMap); } if (m->control_pressed) { delete formatMatrix; return 0; } diff --git a/heatmapcommand.cpp b/heatmapcommand.cpp index d160525..441eaf8 100644 --- a/heatmapcommand.cpp +++ b/heatmapcommand.cpp @@ -173,27 +173,27 @@ HeatMapCommand::HeatMapCommand(string option) { //check for required parameters listfile = validParameter.validFile(parameters, "list", true); - if (listfile == "not open") { listfile = ""; abort = true; } + if (listfile == "not open") { abort = true; } else if (listfile == "not found") { listfile = ""; } else { format = "list"; inputfile = listfile; m->setListFile(listfile); } sabundfile = validParameter.validFile(parameters, "sabund", true); - if (sabundfile == "not open") { sabundfile = ""; abort = true; } + if (sabundfile == "not open") { abort = true; } else if (sabundfile == "not found") { sabundfile = ""; } else { format = "sabund"; inputfile = sabundfile; m->setSabundFile(sabundfile); } rabundfile = validParameter.validFile(parameters, "rabund", true); - if (rabundfile == "not open") { rabundfile = ""; abort = true; } + if (rabundfile == "not open") { abort = true; } else if (rabundfile == "not found") { rabundfile = ""; } else { format = "rabund"; inputfile = rabundfile; m->setRabundFile(rabundfile); } sharedfile = validParameter.validFile(parameters, "shared", true); - if (sharedfile == "not open") { sharedfile = ""; abort = true; } + if (sharedfile == "not open") { abort = true; } else if (sharedfile == "not found") { sharedfile = ""; } else { format = "sharedfile"; inputfile = sharedfile; m->setSharedFile(sharedfile); } relabundfile = validParameter.validFile(parameters, "relabund", true); - if (relabundfile == "not open") { relabundfile = ""; abort = true; } + if (relabundfile == "not open") { abort = true; } else if (relabundfile == "not found") { relabundfile = ""; } else { format = "relabund"; inputfile = relabundfile; m->setRelAbundFile(relabundfile); } diff --git a/homovacommand.cpp b/homovacommand.cpp index 006f1e4..6496cea 100644 --- a/homovacommand.cpp +++ b/homovacommand.cpp @@ -10,12 +10,14 @@ #include "homovacommand.h" #include "groupmap.h" #include "readphylipvector.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector HomovaCommand::setParameters(){ try { CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign); CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip); + CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter palpha("alpha", "Number", "", "0.05", "", "", "",false,false); parameters.push_back(palpha); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -36,9 +38,10 @@ string HomovaCommand::getHelpString(){ string helpString = ""; helpString += "Referenced: Stewart CN, Excoffier L (1996). Assessing population genetic structure and variability with RAPD data: Application to Vaccinium macrocarpon (American Cranberry). J Evol Biol 9: 153-71.\n"; helpString += "The homova command outputs a .homova file. \n"; - helpString += "The homova command parameters are phylip, iters, and alpha. The phylip and design parameters are required, unless valid current files exist.\n"; + helpString += "The homova command parameters are phylip, iters, sets and alpha. The phylip and design parameters are required, unless valid current files exist.\n"; helpString += "The design parameter allows you to assign your samples to groups when you are running homova. It is required. \n"; helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the sample name and the second column is the group the sample belongs to.\n"; + helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n"; helpString += "The iters parameter allows you to set number of randomization for the P value. The default is 1000. \n"; helpString += "The homova command should be in the following format: homova(phylip=file.dist, design=file.design).\n"; helpString += "Note: No spaces between parameter labels (i.e. iters), '=' and parameters (i.e. 1000).\n"; @@ -162,6 +165,12 @@ HomovaCommand::HomovaCommand(string option) { temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found") { temp = "0.05"; } m->mothurConvert(temp, experimentwiseAlpha); + + string sets = validParameter.validFile(parameters, "sets", false); + if (sets == "not found") { sets = ""; } + else { + m->splitAtDash(sets, Sets); + } } } @@ -187,6 +196,31 @@ int HomovaCommand::execute(){ ReadPhylipVector readMatrix(phylipFileName); vector sampleNames = readMatrix.read(distanceMatrix); + if (Sets.size() != 0) { //user selected sets, so we want to remove the samples not in those sets + SharedUtil util; + vector dGroups = designMap->getNamesOfGroups(); + util.setGroups(Sets, dGroups); + + for(int i=0;icontrol_pressed) { delete designMap; return 0; } + + string group = designMap->getGroup(sampleNames[i]); + + if (group == "not found") { + m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; + }else if (!m->inUsersGroups(group, Sets)){ //not in set we want remove it + //remove from all other rows + for(int j=0;j > getRandomizedGroups(map >); bool abort; - vector outputNames; + vector outputNames, Sets; string outputDir, inputDir, designFileName, phylipFileName; GroupMap* designMap; diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp index 68e70ee..6385ade 100644 --- a/makebiomcommand.cpp +++ b/makebiomcommand.cpp @@ -9,6 +9,7 @@ #include "makebiomcommand.h" #include "sharedrabundvector.h" #include "inputdata.h" +#include "sharedutilities.h" //taken from http://biom-format.org/documentation/biom_format.html /* Minimal Sparse @@ -93,6 +94,7 @@ vector MakeBiomCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcontaxonomy); + CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pmetadata); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -112,11 +114,12 @@ vector MakeBiomCommand::setParameters(){ string MakeBiomCommand::getHelpString(){ try { string helpString = ""; - helpString += "The make.biom command parameters are shared, contaxonomy, groups, matrixtype and label. shared is required, unless you have a valid current file.\n"; + helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype and label. shared is required, unless you have a valid current file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n"; - helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled.\n"; + helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n"; + helpString += "The metadata parameter is used to provide experimental parameters to the columns. Things like 'sample1 gut human_gut'. \n"; helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n"; helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n"; helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n"; @@ -211,6 +214,14 @@ MakeBiomCommand::MakeBiomCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["contaxonomy"] = inputDir + it->second; } } + + it = parameters.find("metadata"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["metadata"] = inputDir + it->second; } + } } //get shared file @@ -231,6 +242,9 @@ MakeBiomCommand::MakeBiomCommand(string option) { if (contaxonomyfile == "not found") { contaxonomyfile = ""; } else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; } + metadatafile = validParameter.validFile(parameters, "metadata", true); + if (metadatafile == "not found") { metadatafile = ""; } + else if (metadatafile == "not open") { metadatafile = ""; abort = true; } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -280,6 +294,8 @@ int MakeBiomCommand::execute(){ labels.insert(lastLabel); } + getSampleMetaData(lookup); + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; @@ -423,13 +439,13 @@ int MakeBiomCommand::getBiom(vector& lookup){ {"id":"Sample6", "metadata":null} ],*/ - string colBack = "\", \"metadata\":null}"; + string colBack = "\", \"metadata\":"; out << spaces + "\"columns\":[\n"; for (int i = 0; i < lookup.size()-1; i++) { if (m->control_pressed) { out.close(); return 0; } - out << rowFront << lookup[i]->getGroup() << colBack << ",\n"; + out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n"; } - out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << "\n" + spaces + "],\n"; + out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n"; out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n"; out << spaces + "\"shape\": [" << m->currentBinLabels.size() << "," << lookup.size() << "],\n"; @@ -608,6 +624,81 @@ vector MakeBiomCommand::getMetaData(vector& lookup) } } +//********************************************************************************************************************** +int MakeBiomCommand::getSampleMetaData(vector& lookup){ + try { + + if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } } + else { + ifstream in; + m->openInputFile(metadatafile, in); + + vector groupNames, metadataLabels; + map > lines; + + string headerLine = m->getline(in); m->gobble(in); + vector pieces = m->splitWhiteSpace(headerLine); + + //save names of columns you are reading + for (int i = 1; i < pieces.size(); i++) { + metadataLabels.push_back(pieces[i]); + } + int count = metadataLabels.size(); + + vector groups = m->getGroups(); + + //read rest of file + while (!in.eof()) { + + if (m->control_pressed) { in.close(); return 0; } + + string group = ""; + in >> group; m->gobble(in); + groupNames.push_back(group); + + string line = m->getline(in); m->gobble(in); + vector thisPieces = m->splitWhiteSpaceWithQuotes(line); + + if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); } + else { if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } } + + m->gobble(in); + } + in.close(); + + map >::iterator it; + for (int i = 0; i < lookup.size(); i++) { + + if (m->control_pressed) { return 0; } + + it = lines.find(lookup[i]->getGroup()); + + if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; } + else { + vector values = it->second; + + string data = "{"; + for (int j = 0; j < metadataLabels.size()-1; j++) { + values[j] = m->removeQuotes(values[j]); + data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", "; + } + values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]); + data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}"; + sampleMetadata.push_back(data); + } + } + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "MakeBiomCommand", "getSampleMetaData"); + exit(1); + } + +} + /**************************************************************************************************/ //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null} vector MakeBiomCommand::parseTax(string tax, vector& scores) { diff --git a/makebiomcommand.h b/makebiomcommand.h index 8a458fe..ee9bd1c 100644 --- a/makebiomcommand.h +++ b/makebiomcommand.h @@ -35,8 +35,8 @@ public: private: - string sharedfile, contaxonomyfile, groups, outputDir, format, label; - vector outputNames, Groups; + string sharedfile, contaxonomyfile, metadatafile, groups, outputDir, format, label; + vector outputNames, Groups, sampleMetadata; set labels; bool abort, allLines; @@ -44,6 +44,7 @@ private: int getBiom(vector&); vector getMetaData(vector&); vector parseTax(string tax, vector& scores); + int getSampleMetaData(vector&); }; diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 4ae25ce..f2f6449 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -16,8 +16,8 @@ vector MakeContigsCommand::setParameters(){ CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); - CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); - CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); +// CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); +// CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign); @@ -220,16 +220,16 @@ MakeContigsCommand::MakeContigsCommand(string option) { temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, pdiffs); - temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, ldiffs); + // temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } +// m->mothurConvert(temp, ldiffs); - temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, sdiffs); + // temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } +// m->mothurConvert(temp, sdiffs); temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } m->mothurConvert(temp, tdiffs); - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } //+ ldiffs + sdiffs; temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); @@ -261,15 +261,32 @@ int MakeContigsCommand::execute(){ if (m->control_pressed) { return 0; } - string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("fasta"); - string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("qfile"); + vector > fastaFileNames; + vector > qualFileNames; + createGroup = false; + string outputGroupFileName; + if(oligosfile != ""){ + createGroup = getOligos(fastaFileNames, qualFileNames); + if (createGroup) { + outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("group"); + outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); + } + } + + string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "trim." + getOutputFileNameTag("fasta"); + string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "trim." + getOutputFileNameTag("qfile"); + string outScrapFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "scrap." + getOutputFileNameTag("fasta"); + string outScrapQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "scrap." + getOutputFileNameTag("qfile"); + string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("mismatch"); outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile); + outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); + outputNames.push_back(outScrapQualFile); outputTypes["qfile"].push_back(outScrapQualFile); outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile); m->mothurOut("Making contigs...\n"); - createProcesses(files, outFastaFile, outQualFile, outMisMatchFile); + createProcesses(files, outFastaFile, outQualFile, outScrapFastaFile, outScrapQualFile, outMisMatchFile, fastaFileNames, qualFileNames); m->mothurOut("Done.\n"); //remove temp fasta and qual files @@ -277,8 +294,79 @@ int MakeContigsCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if(allFiles){ + map uniqueFastaNames;// so we don't add the same groupfile multiple times + map::iterator it; + set namesToRemove; + for(int i=0;iisBlank(fastaFileNames[i][j])){ + m->mothurRemove(fastaFileNames[i][j]); + namesToRemove.insert(fastaFileNames[i][j]); + + m->mothurRemove(qualFileNames[i][j]); + namesToRemove.insert(qualFileNames[i][j]); + }else{ + it = uniqueFastaNames.find(fastaFileNames[i][j]); + if (it == uniqueFastaNames.end()) { + uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i]; + } + } + } + } + } + } + + //remove names for outputFileNames, just cleans up the output + vector outputNames2; + for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } } + outputNames = outputNames2; + + for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { + ifstream in; + m->openInputFile(it->first, in); + + ofstream out; + string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)); + thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); + m->openOutputFile(thisGroupName, out); + + while (!in.eof()){ + if (m->control_pressed) { break; } + + Sequence currSeq(in); m->gobble(in); + out << currSeq.getName() << '\t' << it->second << endl; + } + in.close(); + out.close(); + } + } + + if (createGroup) { + ofstream outGroup; + m->openOutputFile(outputGroupFileName, outGroup); + for (map::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) { + outGroup << itGroup->first << '\t' << itGroup->second << endl; + } + outGroup.close(); + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n"); + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //output group counts + m->mothurOutEndLine(); + int total = 0; + if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); } + for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine(); + } + if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + string currentFasta = ""; itTypes = outputTypes.find("fasta"); if (itTypes != outputTypes.end()) { @@ -311,7 +399,7 @@ int MakeContigsCommand::execute(){ } } //********************************************************************************************************************** -int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputQual, string outputMisMatches) { +int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames) { try { int num = 0; vector processIDS; @@ -326,14 +414,52 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(files[process], outputFasta + toString(getpid()) + ".temp", outputQual + toString(getpid()) + ".temp", outputMisMatches + toString(getpid()) + ".temp"); + vector > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + } + } + } + + num = driver(files[process], + outputFasta + toString(getpid()) + ".temp", + outputQual + toString(getpid()) + ".temp", + outputScrapFasta + toString(getpid()) + ".temp", + outputScrapQual + toString(getpid()) + ".temp", + outputMisMatches + toString(getpid()) + ".temp", + tempFASTAFileNames, + tempPrimerQualFileNames); - //pass numSeqs to parent - ofstream out; - string tempFile = outputFasta + toString(getpid()) + ".num.temp"; - m->openOutputFile(tempFile, out); - out << num << endl; - out.close(); + //pass groupCounts to parent + ofstream out; + string tempFile = toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + if(createGroup){ + out << groupCounts.size() << endl; + + for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + out << it->first << '\t' << it->second << endl; + } + + out << groupMap.size() << endl; + for (map::iterator it = groupMap.begin(); it != groupMap.end(); it++) { + out << it->first << '\t' << it->second << endl; + } + } + out.close(); exit(0); }else { @@ -343,8 +469,14 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } } + ofstream temp; + m->openOutputFile(outputFasta, temp); temp.close(); + m->openOutputFile(outputQual, temp); temp.close(); + m->openOutputFile(outputScrapFasta, temp); temp.close(); + m->openOutputFile(outputScrapQual, temp); temp.close(); + //do my part - num = driver(files[processors-1], outputFasta, outputQual, outputMisMatches); + num = driver(files[processors-1], outputFasta, outputQual, outputScrapFasta, outputScrapQual, outputMisMatches, fastaFileNames, qualFileNames); //force parent to wait until all the processes are done for (int i=0;i > files, string o } for (int i = 0; i < processIDS.size(); i++) { - ifstream in; - string tempFile = outputFasta + toString(processIDS[i]) + ".num.temp"; - m->openInputFile(tempFile, in); - if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } - in.close(); m->mothurRemove(tempFile); + ifstream in; + string tempFile = toString(processIDS[i]) + ".num.temp"; + m->openInputFile(tempFile, in); + int tempNum; + in >> tempNum; num += tempNum; m->gobble(in); + + if(createGroup){ + string group; + in >> tempNum; m->gobble(in); + + if (tempNum != 0) { + for (int j = 0; j < tempNum; j++) { + int groupNum; + in >> group >> groupNum; m->gobble(in); + + map::iterator it = groupCounts.find(group); + if (it == groupCounts.end()) { groupCounts[group] = groupNum; } + else { groupCounts[it->first] += groupNum; } + } + } + in >> tempNum; m->gobble(in); + if (tempNum != 0) { + for (int j = 0; j < tempNum; j++) { + string group, seqName; + in >> seqName >> group; m->gobble(in); + + map::iterator it = groupMap.find(seqName); + if (it == groupMap.end()) { groupMap[seqName] = group; } + else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + } + } + in.close(); m->mothurRemove(tempFile); } #else @@ -371,17 +531,67 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o HANDLE hThreadArray[processors-1]; //Create processor worker threads. - for( int i=0; i > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + } + } + } + + + contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputQual + extension), (outputScrapFasta + extension), (outputScrapQual + extension),(outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, barcodes, primers, tempFASTAFileNames, tempPrimerQualFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h); pDataArray.push_back(tempcontig); - processIDS.push_back(i); - hThreadArray[i] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); } + + vector > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + + if(allFiles){ + ofstream temp; + string extension = toString(processors-1) + ".temp"; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + } + } + } + + //parent do my part + ofstream temp; + m->openOutputFile(outputFasta, temp); temp.close(); + m->openOutputFile(outputQual, temp); temp.close(); + m->openOutputFile(outputScrapFasta, temp); temp.close(); + m->openOutputFile(outputScrapQual, temp); temp.close(); - num = driver(files[processors-1], outputFasta, outputQual, outputMisMatches); + + //do my part + num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); @@ -389,6 +599,16 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; + for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { + map::iterator it2 = groupCounts.find(it->first); + if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } + else { groupCounts[it->first] += it->second; } + } + for (map::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) { + map::iterator it2 = groupMap.find(it->first); + if (it2 == groupMap.end()) { groupMap[it->first] = it->second; } + else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } @@ -402,8 +622,28 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o m->appendFiles((outputQual + toString(processIDS[i]) + ".temp"), outputQual); m->mothurRemove((outputQual + toString(processIDS[i]) + ".temp")); + m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta); + m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp")); + + m->appendFiles((outputScrapQual + toString(processIDS[i]) + ".temp"), outputScrapQual); + m->mothurRemove((outputScrapQual + toString(processIDS[i]) + ".temp")); + m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp")); + + if(allFiles){ + for(int j=0;jappendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); + m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp")); + + m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); + m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp")); + } + } + } + } } return num; @@ -414,7 +654,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } } //********************************************************************************************************************** -int MakeContigsCommand::driver(vector files, string outputFasta, string outputQual, string outputMisMatches){ +int MakeContigsCommand::driver(vector files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames){ try { Alignment* alignment; @@ -435,26 +675,51 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string m->openInputFile(thisrfastafile, inRFasta); m->openInputFile(thisrqualfile, inRQual); - ofstream outFasta, outQual, outMisMatch; + ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual; m->openOutputFile(outputFasta, outFasta); m->openOutputFile(outputQual, outQual); + m->openOutputFile(outputScrapFasta, outScrapFasta); + m->openOutputFile(outputScrapQual, outScrapQual); m->openOutputFile(outputMisMatches, outMisMatch); outMisMatch << "Name\tLength\tMisMatches\n"; + TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes); + while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) { if (m->control_pressed) { break; } + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + //read seqs and quality info Sequence fSeq(inFFasta); m->gobble(inFFasta); Sequence rSeq(inRFasta); m->gobble(inRFasta); QualityScores fQual(inFQual); m->gobble(inFQual); QualityScores rQual(inRQual); m->gobble(inRQual); + int barcodeIndex = 0; + int primerIndex = 0; + + if(barcodes.size() != 0){ + success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(primers.size() != 0){ + success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + //flip the reverse reads rSeq.reverseComplement(); rQual.flipQScores(); - + //pairwise align alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned()); map ABaseMap = alignment->getSeqAAlnBaseMap(); @@ -462,7 +727,7 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string fSeq.setAligned(alignment->getSeqAAln()); rSeq.setAligned(alignment->getSeqBAln()); int length = fSeq.getAligned().length(); - + //traverse alignments merging into one contiguous seq string contig = ""; vector contigScores; @@ -471,8 +736,8 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string string seq2 = rSeq.getAligned(); vector scores1 = fQual.getQualityScores(); vector scores2 = rQual.getQualityScores(); - - // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; } + + // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; } int overlapStart = fSeq.getStartPos(); int seq2Start = rSeq.getStartPos(); //bigger of the 2 starting positions is the location of the overlapping start @@ -532,16 +797,60 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string contig += seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); } + + } + if(trashCode.length() == 0){ + if (createGroup) { + if(barcodes.size() != 0){ + string thisGroup = barcodeNameVector[barcodeIndex]; + if (primers.size() != 0) { + if (primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + primerNameVector[primerIndex]; + }else { + thisGroup = primerNameVector[primerIndex]; + } + } + } + + if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } + + groupMap[fSeq.getName()] = thisGroup; + + map::iterator it = groupCounts.find(thisGroup); + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } + else { groupCounts[it->first] ++; } + + } + } + + if(allFiles){ + ofstream output; + m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output); + output << ">" << fSeq.getName() << endl << contig << endl; + output.close(); + + m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output); + output << ">" << fSeq.getName() << endl; + for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; } + output << endl; + output.close(); + } + + //output + outFasta << ">" << fSeq.getName() << endl << contig << endl; + outQual << ">" << fSeq.getName() << endl; + for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } + outQual << endl; + outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; + }else { + //output + outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl; + outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl; + for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; } + outScrapQual << endl; } - //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; } - //output - outFasta << ">" << fSeq.getName() << endl << contig << endl; - outQual << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } - outQual << endl; - outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; - num++; //report progress @@ -557,10 +866,12 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string inRQual.close(); outFasta.close(); outQual.close(); + outScrapFasta.close(); + outScrapQual.close(); outMisMatch.close(); delete alignment; - if (m->control_pressed) { m->mothurRemove(outputQual); m->mothurRemove(outputFasta); m->mothurRemove(outputMisMatches);} + if (m->control_pressed) { m->mothurRemove(outputQual); m->mothurRemove(outputFasta); m->mothurRemove(outputScrapQual); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches);} return num; } @@ -618,39 +929,57 @@ vector< vector > MakeContigsCommand::readFastqFiles(int& count){ m->openInputFile(rfastqfile, inReverse); count = 0; - while ((!inForward.eof()) && (!inReverse.eof())) { + map uniques; + map::iterator itUniques; + while ((!inForward.eof()) || (!inReverse.eof())) { if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } //get a read from forward and reverse fastq files - fastqRead fread = readFastq(inForward); - fastqRead rread = readFastq(inReverse); - checkReads(fread, rread); - - if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } - - //if the reads are okay write to output files - int process = count % processors; + bool ignoref, ignorer; + fastqRead thisFread, thisRread; + if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); } + else { ignoref = true; } + if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); } + else { ignorer = true; } - *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl; - *(tempfiles[process][1]) << ">" << fread.name << endl; - for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; } - *(tempfiles[process][1]) << endl; - *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl; - *(tempfiles[process][3]) << ">" << rread.name << endl; - for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; } - *(tempfiles[process][3]) << endl; + vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques); - count++; - - //report progress - if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - + for (int i = 0; i < reads.size(); i++) { + fastqRead fread = reads[i].forward; + fastqRead rread = reads[i].reverse; + + if (checkReads(fread, rread)) { + if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } + + //if the reads are okay write to output files + int process = count % processors; + + *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl; + *(tempfiles[process][1]) << ">" << fread.name << endl; + for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; } + *(tempfiles[process][1]) << endl; + *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl; + *(tempfiles[process][3]) << ">" << rread.name << endl; + for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; } + *(tempfiles[process][3]) << endl; + + count++; + + //report progress + if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + } + } } //report progress if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - + if (uniques.size() != 0) { + for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) { + m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n"); + } + m->mothurOutEndLine(); + } //close files, delete ofstreams for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } @@ -658,6 +987,7 @@ vector< vector > MakeContigsCommand::readFastqFiles(int& count){ inReverse.close(); //adjust for really large processors or really small files + if (count == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; } if (count < processors) { for (int i = count; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); } files.resize(count); @@ -672,43 +1002,110 @@ vector< vector > MakeContigsCommand::readFastqFiles(int& count){ } } //********************************************************************************************************************** -fastqRead MakeContigsCommand::readFastq(ifstream& in){ +vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques){ + try { + vector reads; + map::iterator itUniques; + + if (!ignoref && !ignorer) { + if (forward.name == reverse.name) { + pairFastqRead temp(forward, reverse); + reads.push_back(temp); + }else { + //look for forward pair + itUniques = uniques.find(forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(forward, itUniques->second); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[forward.name] = forward; + } + + //look for reverse pair + itUniques = uniques.find(reverse.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second, reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[reverse.name] = reverse; + } + } + }else if (!ignoref && ignorer) { //ignore reverse keep forward + //look for forward pair + itUniques = uniques.find(forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(forward, itUniques->second); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[forward.name] = forward; + } + + }else if (ignoref && !ignorer) { //ignore forward keep reverse + //look for reverse pair + itUniques = uniques.find(reverse.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second, reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[reverse.name] = reverse; + } + }//else ignore both and do nothing + + return reads; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "readFastqFiles"); + exit(1); + } +} +//********************************************************************************************************************** +fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ try { fastqRead read; + ignore = false; + //read sequence name - string name = m->getline(in); m->gobble(in); - if (name == "") { m->mothurOut("[ERROR]: Blank fasta name."); m->mothurOutEndLine(); m->control_pressed = true; return read; } - else if (name[0] != '@') { m->mothurOut("[ERROR]: reading " + name + " expected a name with @ as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; return read; } + string line = m->getline(in); m->gobble(in); + vector pieces = m->splitWhiteSpace(line); + string name = ""; if (pieces.size() != 0) { name = pieces[0]; } + if (name == "") { m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true; } + else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; } else { name = name.substr(1); } //read sequence string sequence = m->getline(in); m->gobble(in); - if (sequence == "") { m->mothurOut("[ERROR]: missing sequence for " + name); m->mothurOutEndLine(); m->control_pressed = true; return read; } + if (sequence == "") { m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; } //read sequence name - string name2 = m->getline(in); m->gobble(in); - if (name2 == "") { m->mothurOut("[ERROR]: Blank quality name."); m->mothurOutEndLine(); m->control_pressed = true; return read; } - else if (name2[0] != '+') { m->mothurOut("[ERROR]: reading " + name2 + " expected a name with + as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; return read; } - else { name2 = name2.substr(1); } + line = m->getline(in); m->gobble(in); + pieces = m->splitWhiteSpace(line); + string name2 = ""; if (pieces.size() != 0) { name2 = pieces[0]; } + if (name2 == "") { m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; } + else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; } + else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } } //read quality scores string quality = m->getline(in); m->gobble(in); - if (quality == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; return read; } - + if (quality == "") { m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; } + //sanity check sequence length and number of quality scores match - if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; return read; } } - if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; return read; } + if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } } + if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; } vector qualScores; - int controlChar = int('@'); + int controlChar = int('!'); for (int i = 0; i < quality.length(); i++) { int temp = int(quality[i]); temp -= controlChar; qualScores.push_back(temp); } - + read.name = name; read.sequence = sequence; read.scores = qualScores; @@ -725,34 +1122,22 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse){ try { bool good = true; - //fix names - if ((forward.name.length() > 2) && (reverse.name.length() > 2)) { - forward.name = forward.name.substr(0, forward.name.length()-2); - reverse.name = reverse.name.substr(0, reverse.name.length()-2); - }else { good = false; m->control_pressed = true; } - - //do names match - if (forward.name != reverse.name) { - m->mothurOut("[ERROR]: read " + forward.name + " from " + ffastqfile + ", but read " + reverse.name + " from " + rfastqfile + ".\n"); - good = false; m->control_pressed = true; - } - //do sequence lengths match if (forward.sequence.length() != reverse.sequence.length()) { - m->mothurOut("[ERROR]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffastqfile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfastqfile + ".\n"); - good = false; m->control_pressed = true; + m->mothurOut("[WARNING]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffastqfile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfastqfile + ", ignoring.\n"); + good = false; } //do number of qual scores match if (forward.scores.size() != reverse.scores.size()) { - m->mothurOut("[ERROR]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffastqfile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfastqfile + ".\n"); - good = false; m->control_pressed = true; + m->mothurOut("[WARNING]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffastqfile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfastqfile + ", ignoring.\n"); + good = false; } return good; } catch(exception& e) { - m->errorOut(e, "MakeContigsCommand", "readFastq"); + m->errorOut(e, "MakeContigsCommand", "checkReads"); exit(1); } } @@ -799,7 +1184,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect if(foligo[i] == 'U') { foligo[i] = 'T'; } } - if(type == "PRIMER"){ + if(type == "FORWARD"){ m->gobble(in); in >> roligo; @@ -808,7 +1193,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect roligo[i] = toupper(roligo[i]); if(roligo[i] == 'U') { roligo[i] = 'T'; } } - roligo = reverseOligo(roligo); + //roligo = reverseOligo(roligo); group = ""; @@ -840,7 +1225,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect roligo[i] = toupper(roligo[i]); if(roligo[i] == 'U') { roligo[i] = 'T'; } } - roligo = reverseOligo(roligo); + //roligo = reverseOligo(roligo); oligosPair newPair(foligo, roligo); @@ -863,8 +1248,10 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect barcodeNameVector.push_back(group); }else if(type == "LINKER"){ linker.push_back(foligo); + m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n"); }else if(type == "SPACER"){ spacer.push_back(foligo); + m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n"); } else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); } } diff --git a/makecontigscommand.h b/makecontigscommand.h index 84e43c0..86a8450 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -29,6 +29,14 @@ struct fastqRead { ~fastqRead() {}; }; +struct pairFastqRead { + fastqRead forward; + fastqRead reverse; + + pairFastqRead() {}; + pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){}; + ~pairFastqRead() {}; +}; /**************************************************************************************************/ class MakeContigsCommand : public Command { @@ -50,7 +58,7 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort, allFiles; + bool abort, allFiles, createGroup; string outputDir, ffastqfile, rfastqfile, align, oligosfile; float match, misMatch, gapOpen, gapExtend; int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; @@ -63,18 +71,20 @@ private: vector primerNameVector; vector barcodeNameVector; - map groupCounts; + map groupCounts; + map groupMap; //map combos; //map groupToIndex; //vector groupVector; - fastqRead readFastq(ifstream&); + fastqRead readFastq(ifstream&, bool&); vector< vector > readFastqFiles(int&); bool checkReads(fastqRead&, fastqRead&); - int createProcesses(vector< vector >, string, string, string); - int driver(vector, string, string, string); + int createProcesses(vector< vector >, string, string, string, string, string, vector >, vector >); + int driver(vector, string, string, string, string, string, vector >, vector >); bool getOligos(vector >&, vector >&); string reverseOligo(string); + vector getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques); }; /**************************************************************************************************/ @@ -86,15 +96,26 @@ private: struct contigsData { string outputFasta; string outputQual; + string outputScrapFasta; + string outputScrapQual; string outputMisMatches; string align; vector files; + vector > fastaFileNames; + vector > qualFileNames; MothurOut* m; float match, misMatch, gapOpen, gapExtend; - int count, threshold, threadID; + int count, threshold, threadID, pdiffs, bdiffs, tdiffs; + bool allFiles, createGroup; + map groupCounts; + map groupMap; + vector primerNameVector; + vector barcodeNameVector; + map barcodes; + map primers; contigsData(){} - contigsData(vector f, string of, string oq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int tid) { + contigsData(vector f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map br, map pr, vector > ffn, vector > qfn, vectorbnv, vector pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) { files = f; outputFasta = of; outputQual = oq; @@ -107,6 +128,19 @@ struct contigsData { threshold = thr; align = al; count = 0; + outputScrapFasta = osf; + outputScrapQual = osq; + fastaFileNames = ffn; + qualFileNames = qfn; + barcodes = br; + primers = pr; + barcodeNameVector = bnv; + primerNameVector = pnv; + pdiffs = pdf; + bdiffs = bdf; + tdiffs = tdf; + allFiles = all; + createGroup = cg; threadID = tid; } }; @@ -132,32 +166,69 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); } + if(pDataArray->allFiles){ + for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file + for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file + if (pDataArray->fastaFileNames[i][j] != "") { + ofstream temp; + pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close(); + pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close(); + } + } + } + } + ifstream inFFasta, inRFasta, inFQual, inRQual; pDataArray->m->openInputFile(thisffastafile, inFFasta); pDataArray->m->openInputFile(thisfqualfile, inFQual); pDataArray->m->openInputFile(thisrfastafile, inRFasta); pDataArray->m->openInputFile(thisrqualfile, inRQual); - ofstream outFasta, outQual, outMisMatch; + ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual; pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta); pDataArray->m->openOutputFile(pDataArray->outputQual, outQual); pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch); + pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta); + pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual); outMisMatch << "Name\tLength\tMisMatches\n"; + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes); + while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) { if (pDataArray->m->control_pressed) { break; } + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + //read seqs and quality info Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta); Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta); QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual); QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual); + int barcodeIndex = 0; + int primerIndex = 0; + + if(pDataArray->barcodes.size() != 0){ + success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex); + if(success > pDataArray->bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(pDataArray->primers.size() != 0){ + success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex); + if(success > pDataArray->pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; } + //flip the reverse reads rSeq.reverseComplement(); rQual.flipQScores(); - + //pairwise align alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned()); map ABaseMap = alignment->getSeqAAlnBaseMap(); @@ -172,29 +243,51 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ int numMismatches = 0; string seq1 = fSeq.getAligned(); string seq2 = rSeq.getAligned(); - vector scores1 = fQual.getQualityScores(); vector scores2 = rQual.getQualityScores(); - for (int i = 0; i < length; i++) { + int overlapStart = fSeq.getStartPos(); + int seq2Start = rSeq.getStartPos(); + //bigger of the 2 starting positions is the location of the overlapping start + if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1 + overlapStart = seq2Start; + for (int i = 0; i < overlapStart; i++) { + contig += seq1[i]; + contigScores.push_back(scores1[ABaseMap[i]]); + } + }else { //seq1 starts later so take from 0 to overlapStart from seq2 + for (int i = 0; i < overlapStart; i++) { + contig += seq2[i]; + contigScores.push_back(scores2[BBaseMap[i]]); + } + } + + int seq1End = fSeq.getEndPos(); + int seq2End = rSeq.getEndPos(); + int overlapEnd = seq1End; + if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends + + for (int i = overlapStart; i < overlapEnd; i++) { if (seq1[i] == seq2[i]) { //match, add base and choose highest score contig += seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; } + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; } }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base - if (scores2[BBaseMap[i]] >= pDataArray->threshold) { + if (scores2[BBaseMap[i]] < pDataArray->threshold) { } // + else { contig += seq2[i]; contigScores.push_back(scores2[BBaseMap[i]]); } }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base - if (scores1[ABaseMap[i]] >= pDataArray->threshold) { + if (scores1[ABaseMap[i]] < pDataArray->threshold) { } // + else { contig += seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); } }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality char c = seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; } + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; } contig += c; numMismatches++; }else { //should never get here @@ -202,13 +295,70 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ } } - //output - outFasta << ">" << fSeq.getName() << endl << contig << endl; - outQual << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } - outQual << endl; - outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; - + if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2 + for (int i = overlapEnd; i < length; i++) { + contig += seq2[i]; + contigScores.push_back(scores2[BBaseMap[i]]); + } + }else { //seq2 ends before seq1 so take from overlap to length from seq1 + for (int i = overlapEnd; i < length; i++) { + contig += seq1[i]; + contigScores.push_back(scores1[ABaseMap[i]]); + } + + } + + if(trashCode.length() == 0){ + if (pDataArray->createGroup) { + if(pDataArray->barcodes.size() != 0){ + string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; + if (pDataArray->primers.size() != 0) { + if (pDataArray->primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + pDataArray->primerNameVector[primerIndex]; + }else { + thisGroup = pDataArray->primerNameVector[primerIndex]; + } + } + } + + if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); } + + pDataArray->groupMap[fSeq.getName()] = thisGroup; + + map::iterator it = pDataArray->groupCounts.find(thisGroup); + if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; } + else { pDataArray->groupCounts[it->first] ++; } + + } + } + + if(pDataArray->allFiles){ + ofstream output; + pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); + output << ">" << fSeq.getName() << endl << contig << endl; + output.close(); + + pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); + output << ">" << fSeq.getName() << endl; + for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; } + output << endl; + output.close(); + } + + //output + outFasta << ">" << fSeq.getName() << endl << contig << endl; + outQual << ">" << fSeq.getName() << endl; + for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } + outQual << endl; + outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; + }else { + //output + outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl; + outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl; + for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; } + outScrapQual << endl; + } num++; //report progress @@ -225,9 +375,11 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ outFasta.close(); outQual.close(); outMisMatch.close(); + outScrapFasta.close(); + outScrapQual.close(); delete alignment; - if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches);} + if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);} return 0; diff --git a/mothurout.cpp b/mothurout.cpp index 7d40e80..37c0916 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1541,6 +1541,46 @@ vector MothurOut::splitWhiteSpace(string input){ exit(1); } } +/***********************************************************************/ +vector MothurOut::splitWhiteSpaceWithQuotes(string input){ + try { + vector pieces; + string rest = ""; + + int pos = input.find('\''); + int pos2 = input.find('\"'); + + if ((pos == string::npos) && (pos2 == string::npos)) { return splitWhiteSpace(input); } //no quotes to worry about + else { + for (int i = 0; i < input.length(); i++) { + if ((input[i] == '\'') || (input[i] == '\"') || (rest == "\'") || (rest == "\"")) { //grab everything til end or next ' or " + rest += input[i]; + for (int j = i+1; j < input.length(); j++) { + if ((input[j] == '\'') || (input[j] == '\"')) { //then quit + rest += input[j]; + i = j+1; + j+=input.length(); + }else { rest += input[j]; } + } + }else if (!isspace(input[i])) { rest += input[i]; } + else { + if (rest != "") { pieces.push_back(rest); rest = ""; } + while (i < input.length()) { //gobble white space + if (isspace(input[i])) { i++; } + else { rest = input[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; + } + } + } + + if (rest != "") { pieces.push_back(rest); } + } + return pieces; + } + catch(exception& e) { + errorOut(e, "MothurOut", "splitWhiteSpace"); + exit(1); + } +} //********************************************************************************************************************** int MothurOut::readTax(string namefile, map& taxMap) { try { @@ -2688,6 +2728,35 @@ bool MothurOut::inUsersGroups(string groupname, vector Groups) { exit(1); } } +/**************************************************************************************************/ + +bool MothurOut::inUsersGroups(vector set, vector< vector > sets) { + try { + for (int i = 0; i < sets.size(); i++) { + if (set == sets[i]) { return true; } + } + return false; + } + catch(exception& e) { + errorOut(e, "MothurOut", "inUsersGroups"); + exit(1); + } +} +/**************************************************************************************************/ + +bool MothurOut::inUsersGroups(int groupname, vector Groups) { + try { + for (int i = 0; i < Groups.size(); i++) { + if (groupname == Groups[i]) { return true; } + } + return false; + } + catch(exception& e) { + errorOut(e, "MothurOut", "inUsersGroups"); + exit(1); + } +} + /**************************************************************************************************/ //returns true if any of the strings in first vector are in second vector bool MothurOut::inUsersGroups(vector groupnames, vector Groups) { diff --git a/mothurout.h b/mothurout.h index 53d4250..0d2e86f 100644 --- a/mothurout.h +++ b/mothurout.h @@ -120,7 +120,9 @@ class MothurOut { bool checkReleaseVersion(ifstream&, string); bool anyLabelsToProcess(string, set&, string); bool inUsersGroups(vector, vector); + bool inUsersGroups(vector, vector< vector >); bool inUsersGroups(string, vector); + bool inUsersGroups(int, vector); void getNumSeqs(ifstream&, int&); int getNumSeqs(ifstream&); int getNumNames(string); @@ -139,6 +141,7 @@ class MothurOut { void splitAtDash(string&, vector&); void splitAtChar(string&, vector&, char); void splitAtChar(string&, string&, char); + vector splitWhiteSpaceWithQuotes(string); int removeConfidences(string&); string removeQuotes(string); string makeList(vector&); diff --git a/otuassociationcommand.cpp b/otuassociationcommand.cpp index 06f83a4..968d767 100644 --- a/otuassociationcommand.cpp +++ b/otuassociationcommand.cpp @@ -16,6 +16,7 @@ vector OTUAssociationCommand::setParameters(){ CommandParameter pshared("shared", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(pshared); CommandParameter prelabund("relabund", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(prelabund); CommandParameter pmetadata("metadata", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(pmetadata); + CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pmethod("method", "Multiple", "pearson-spearman-kendall", "pearson", "", "", "",false,false); parameters.push_back(pmethod); @@ -37,9 +38,10 @@ string OTUAssociationCommand::getHelpString(){ string helpString = ""; helpString += "The otu.association command reads a shared or relabund file and calculates the correlation coefficients between otus.\n"; helpString += "If you provide a metadata file, mothur will calculate te correlation bewteen the metadata and the otus.\n"; - helpString += "The otu.association command parameters are shared, relabund, metadata, groups, method and label. The shared or relabund parameter is required.\n"; + helpString += "The otu.association command parameters are shared, relabund, metadata, groups, method, cutoff and label. The shared or relabund parameter is required.\n"; helpString += "The groups parameter allows you to specify which of the groups you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distances level you would like used, and are also separated by dashes.\n"; + helpString += "The cutoff parameter allows you to set a pvalue at which the otu will be reported.\n"; helpString += "The method parameter allows you to select what method you would like to use. Options are pearson, spearman and kendall. Default=pearson.\n"; helpString += "The otu.association command should be in the following format: otu.association(shared=yourSharedFile, method=yourMethod).\n"; helpString += "Example otu.association(shared=genus.pool.shared, method=kendall).\n"; @@ -195,6 +197,10 @@ OTUAssociationCommand::OTUAssociationCommand(string option) { method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "pearson"; } + string temp = validParameter.validFile(parameters, "cutoff", false); + if (temp == "not found") { temp = "10"; } + m->mothurConvert(temp, cutoff); + if ((method != "pearson") && (method != "spearman") && (method != "kendall")) { m->mothurOut(method + " is not a valid method. Valid methods are pearson, spearman, and kendall."); m->mothurOutEndLine(); abort = true; } } @@ -356,7 +362,7 @@ int OTUAssociationCommand::process(vector& lookup){ else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); } else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; } - out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; + if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; } } } }else { //compare otus to metadata @@ -373,7 +379,7 @@ int OTUAssociationCommand::process(vector& lookup){ else if (method == "kendall") { coef = linear.calcKendall(xy[i], metadata[k], sig); } else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; } - out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; + if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; } } } @@ -515,7 +521,7 @@ int OTUAssociationCommand::process(vector& lookup){ else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); } else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; } - out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; + if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; } } } }else { //compare otus to metadata @@ -532,7 +538,7 @@ int OTUAssociationCommand::process(vector& lookup){ else if (method == "kendall") { coef = linear.calcKendall(xy[i], metadata[k], sig); } else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; } - out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; + if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; } } } diff --git a/otuassociationcommand.h b/otuassociationcommand.h index 64995ac..7f7664b 100644 --- a/otuassociationcommand.h +++ b/otuassociationcommand.h @@ -36,6 +36,7 @@ private: string sharedfile, relabundfile, metadatafile, groups, label, inputFileName, outputDir, method; bool abort, pickedGroups, allLines; + double cutoff; set labels; vector metadataLookup; vector< vector< double> > metadata; diff --git a/phylotypecommand.cpp b/phylotypecommand.cpp index 38d3bdf..eddddfa 100644 --- a/phylotypecommand.cpp +++ b/phylotypecommand.cpp @@ -272,11 +272,14 @@ int PhylotypeCommand::execute(){ map::iterator itNames = namemap.find(names[i]); //make sure this name is in namefile if (itNames != namemap.end()) { name += namemap[names[i]] + ","; } //you found it in namefile - else { m->mothurOut(names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); exit(1); } + else { m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } }else{ name += names[i] + ","; } } } + + if (m->control_pressed) { break; } + name = name.substr(0, name.length()-1); //rip off extra ',' //add bin to list vector if (name != "") { list.push_back(name); } //caused by unknown diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index a9bb573..a250094 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -450,7 +450,7 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vectorcontrol_pressed) { in.close(); outSummary.close(); return 1; } Sequence current(in); m->gobble(in); - + if (current.getName() != "") { int num = 1; diff --git a/sequence.cpp b/sequence.cpp index a359607..96662bc 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -300,7 +300,7 @@ string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) { if(letter == '>'){ fastaFile.putback(letter); break; - } + }else if (letter == ' ') {;} else if(isprint(letter)){ letter = toupper(letter); if(letter == 'U'){letter = 'T';} @@ -354,7 +354,7 @@ string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) { if(letter == '>'){ fastaFile.putback(letter); break; - } + }else if (letter == ' ') {;} else if(isprint(letter)){ letter = toupper(letter); if(letter == 'U'){letter = 'T';} diff --git a/sharedrabundfloatvector.cpp b/sharedrabundfloatvector.cpp index b8b9102..fa32fc4 100644 --- a/sharedrabundfloatvector.cpp +++ b/sharedrabundfloatvector.cpp @@ -152,8 +152,6 @@ SharedRAbundFloatVector::SharedRAbundFloatVector(ifstream& f) : DataVector(), ma m->saveNextLabel = nextLabel; m->setAllGroups(allGroups); - for (int i = 0; i < allGroups.size(); i++) { cout << allGroups[i] << endl; } - } catch(exception& e) { m->errorOut(e, "SharedRAbundFloatVector", "SharedRAbundFloatVector"); diff --git a/subsample.cpp b/subsample.cpp index 392f97b..a6b1b2d 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -24,7 +24,7 @@ Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) for (int i = 0; i < Groups.size(); i++) { if (m->inUsersGroups(Groups[i], m->getGroups())) { if (m->control_pressed) { break; } - + cout << Groups[i] << endl; int thisSize = ct->getGroupCount(Groups[i]); if (thisSize >= size) { diff --git a/trimoligos.cpp b/trimoligos.cpp index 8f4cbe9..c35971a 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -28,6 +28,33 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + if(it->first.length() > maxFBarcodeLength){ + maxFBarcodeLength = it->first.length(); + } + } + maxFPrimerLength = 0; + map::iterator it; + for(it=primers.begin();it!=primers.end();it++){ + if(it->first.length() > maxFPrimerLength){ + maxFPrimerLength = it->first.length(); + } + } + + maxLinkerLength = 0; + for(int i = 0; i < linker.size(); i++){ + if(linker[i].length() > maxLinkerLength){ + maxLinkerLength = linker[i].length(); + } + } + + maxSpacerLength = 0; + for(int i = 0; i < spacer.size(); i++){ + if(spacer[i].length() > maxSpacerLength){ + maxSpacerLength = spacer[i].length(); + } + } } catch(exception& e) { m->errorOut(e, "TrimOligos", "TrimOligos"); @@ -36,7 +63,7 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map pr, map br, vector lk, vector sp){ +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br){ try { m = MothurOut::getInstance(); @@ -45,10 +72,60 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map< ldiffs = l; sdiffs = s; - ibarcodes = br; - iprimers = pr; - linker = lk; - spacer = sp; + maxFBarcodeLength = 0; + for(map::iterator it=br.begin();it!=br.end();it++){ + string forward = it->second.forward; + map >::iterator itForward = ifbarcodes.find(forward); + + if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); } + + if (itForward == ifbarcodes.end()) { + vector temp; temp.push_back(it->first); + ifbarcodes[forward] = temp; + }else { itForward->second.push_back(it->first); } + } + + maxFPrimerLength = 0; + for(map::iterator it=pr.begin();it!=pr.end();it++){ + string forward = it->second.forward; + map >::iterator itForward = ifprimers.find(forward); + + if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); } + + if (itForward == ifprimers.end()) { + vector temp; temp.push_back(it->first); + ifprimers[forward] = temp; + }else { itForward->second.push_back(it->first); } + } + + maxRBarcodeLength = 0; + for(map::iterator it=br.begin();it!=br.end();it++){ + string forward = it->second.reverse; + map >::iterator itForward = irbarcodes.find(forward); + + if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); } + + if (itForward == irbarcodes.end()) { + vector temp; temp.push_back(it->first); + irbarcodes[forward] = temp; + }else { itForward->second.push_back(it->first); } + } + + maxRPrimerLength = 0; + for(map::iterator it=pr.begin();it!=pr.end();it++){ + string forward = it->second.reverse; + map >::iterator itForward = irprimers.find(forward); + + if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); } + + if (itForward == irprimers.end()) { + vector temp; temp.push_back(it->first); + irprimers[forward] = temp; + }else { itForward->second.push_back(it->first); } + } + + ipbarcodes = br; + ipprimers = pr; } catch(exception& e) { m->errorOut(e, "TrimOligos", "TrimOligos"); @@ -107,21 +184,9 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ if ((bdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it; - - for(it=barcodes.begin();it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } + if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -133,7 +198,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ string oligo = it->first; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length success = bdiffs + 10; break; } @@ -202,7 +267,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality int success = bdiffs + 1; //guilty until proven innocent //can you find the forward barcode - for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ string foligo = it->second.forward; string roligo = it->second.reverse; @@ -229,37 +294,45 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality //if you found the barcode or if you don't want to allow for diffs if ((bdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - //look for forward - int maxLength = 0; - Alignment* alignment; - if (ibarcodes.size() > 0) { - for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ - if(it->second.forward.length() > maxLength){ maxLength = it->second.forward.length(); } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - }else{ alignment = NULL; } + if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; int minCount = 1; - int minFGroup = -1; - int minFPos = 0; - - for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ - string oligo = it->second.forward; + vector< vector > minFGroup; + vector minFPos; + + //the pair can have duplicates, but we only want to search for the unique forward and reverses, example + /* + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore + + only want to look for forward = Sarah, John, Anna, Pat, Gail + reverse = Westcott, Schloss, Brown, Moore + + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + */ + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ + string oligo = it->first; - if(rawFSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length success = bdiffs + 10; break; } - + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } @@ -267,87 +340,136 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = bdiffs + 100; } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; - minFGroup = it->first; - minFPos = 0; + minFGroup.clear(); + minFGroup.push_back(it->second); + int tempminFPos = 0; + minFPos.clear(); for(int i=0;isecond); + int tempminFPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes else{ //check for reverse match if (alignment != NULL) { delete alignment; } - maxLength = 0; - if (ibarcodes.size() > 0) { - for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ - if(it->second.reverse.length() > maxLength){ maxLength = it->second.reverse.length(); } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - }else{ alignment = NULL; } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode minDiff = 1e6; minCount = 1; - int minRGroup = -1; - int minRPos = 0; + vector< vector > minRGroup; + vector minRPos; - for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ - string oligo = it->second.reverse; - - if(rawRSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + for(map >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){ + string oligo = it->first; + //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+bdiffs) << endl; + if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length success = bdiffs + 10; break; } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawRSequence.substr((rawRSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + alignment->align(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); int alnLength = oligo.length(); - for(int i=0;i=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = bdiffs + 100; } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; - minRGroup = it->first; - minRPos = 0; + minRGroup.clear(); + minRGroup.push_back(it->second); + int tempminRPos = 0; + minRPos.clear(); for(int i=0;isecond); } } - + + /*cout << minDiff << '\t' << minCount << '\t' << endl; + for (int i = 0; i < minFGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + cout << endl; + } + cout << endl; + for (int i = 0; i < minRGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + cout << endl; + } + cout << endl;*/ if(minDiff > bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ + else { + bool foundMatch = false; + vector matches; + for (int i = 0; i < minFGroup.size(); i++) { + for (int j = 0; j < minFGroup[i].size(); j++) { + for (int k = 0; k < minRGroup.size(); k++) { + if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); } + } + } + } + + int fStart = 0; + int rStart = 0; + if (matches.size() == 1) { + foundMatch = true; + group = matches[0]; + fStart = minFPos[0]; + rStart = minRPos[0]; + } + //we have an acceptable match for the forward and reverse, but do they match? - if (minFGroup == minRGroup) { - group = minFGroup; - forwardSeq.setUnaligned(rawFSequence.substr(minFPos)); - reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-minRPos))); - forwardQual.trimQScores(minFPos, -1); - reverseQual.trimQScores(-1, rawRSequence.length()-minRPos); + if (foundMatch) { + forwardSeq.setUnaligned(rawFSequence.substr(fStart)); + reverseSeq.setUnaligned(rawRSequence.substr(rStart)); + forwardQual.trimQScores(fStart, -1); + reverseQual.trimQScores(rStart, -1); success = minDiff; }else { success = bdiffs + 100; } } @@ -366,255 +488,252 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //*******************************************************************/ -int TrimOligos::stripBarcode(Sequence& seq, int& group){ +int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); + string rawRSequence = reverseSeq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + //can you find the forward barcode + for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ + string foligo = it->second.forward; + string roligo = it->second.reverse; + + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out break; } - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + forwardQual.trimQScores(foligo.length(), -1); + reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); success = 0; break; } } //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - + if ((pdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } + if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + vector< vector > minFGroup; + vector minFPos; + + //the pair can have duplicates, but we only want to search for the unique forward and reverses, example + /* + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore + + only want to look for forward = Sarah, John, Anna, Pat, Gail + reverse = Westcott, Schloss, Brown, Moore + + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + */ + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ string oligo = it->first; - // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; + if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; break; } - + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + alignment->align(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); + int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = pdiffs + 100; } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; - minGroup = it->second; - minPos = 0; + minFGroup.clear(); + minFGroup.push_back(it->second); + int tempminFPos = 0; + minFPos.clear(); for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripBarcode"); - exit(1); - } - -} -/******************************************************************* -int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); - - if(qual.getName() != ""){ - qual.trimQScores(-1, rawSequence.length()-oligo.length()); - } - - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (rbarcodes.size() > 0) { - map::iterator it; - - for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=0;isecond; - minPos = 0; - for(int i=alnLength-1;i>=0;i--){ + minFPos.push_back(tempminFPos); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); + int tempminFPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); + + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else{ + //check for reverse match + if (alignment != NULL) { delete alignment; } - if(qual.getName() != ""){ - qual.trimQScores(-1, (rawSequence.length()-minPos)); - } - success = minDiff; + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + minDiff = 1e6; + minCount = 1; + vector< vector > minRGroup; + vector minRPos; + + for(map >::iterator it=irprimers.begin();it!=irprimers.end();it++){ + string oligo = it->first; + //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+pdiffs) << endl; + if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = pdiffs + 100; } + + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minRGroup.clear(); + minRGroup.push_back(it->second); + int tempminRPos = 0; + minRPos.clear(); + for(int i=0;isecond); + } + + } + + /*cout << minDiff << '\t' << minCount << '\t' << endl; + for (int i = 0; i < minFGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + cout << endl; + } + cout << endl; + for (int i = 0; i < minRGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + cout << endl; + } + cout << endl;*/ + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else { + bool foundMatch = false; + vector matches; + for (int i = 0; i < minFGroup.size(); i++) { + for (int j = 0; j < minFGroup[i].size(); j++) { + for (int k = 0; k < minRGroup.size(); k++) { + if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); } + } + } + } + + int fStart = 0; + int rStart = 0; + if (matches.size() == 1) { + foundMatch = true; + group = matches[0]; + fStart = minFPos[0]; + rStart = minRPos[0]; + } + + //we have an acceptable match for the forward and reverse, but do they match? + if (foundMatch) { + forwardSeq.setUnaligned(rawFSequence.substr(fStart)); + reverseSeq.setUnaligned(rawRSequence.substr(rStart)); + forwardQual.trimQScores(fStart, -1); + reverseQual.trimQScores(rStart, -1); + success = minDiff; + }else { success = pdiffs + 100; } + } } if (alignment != NULL) { delete alignment; } - } return success; } catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripRBarcode"); + m->errorOut(e, "TrimOligos", "stripIForward"); exit(1); } } -/******************************************************************* -int TrimOligos::stripRBarcode(Sequence& seq, int& group){ +//*******************************************************************/ +int TrimOligos::stripBarcode(Sequence& seq, int& group){ try { string rawSequence = seq.getUnaligned(); int success = bdiffs + 1; //guilty until proven innocent //can you find the barcode - for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out break; } - - if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; - seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); + seq.setUnaligned(rawSequence.substr(oligo.length())); success = 0; break; @@ -625,21 +744,9 @@ int TrimOligos::stripRBarcode(Sequence& seq, int& group){ if ((bdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (rbarcodes.size() > 0) { - map::iterator it; - - for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } + Alignment* alignment; + if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -647,28 +754,28 @@ int TrimOligos::stripRBarcode(Sequence& seq, int& group){ int minGroup = -1; int minPos = 0; - for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length success = bdiffs + 10; break; } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); - for(int i=0;i=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(alnLength); - temp = temp.substr(alnLength); - + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); if(numDiff < minDiff){ @@ -676,7 +783,7 @@ int TrimOligos::stripRBarcode(Sequence& seq, int& group){ minCount = 1; minGroup = it->second; minPos = 0; - for(int i=alnLength-1;i>=0;i--){ + for(int i=0;i 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes else{ //use the best match group = minGroup; - seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); - + seq.setUnaligned(rawSequence.substr(minPos)); success = minDiff; } @@ -705,12 +811,13 @@ int TrimOligos::stripRBarcode(Sequence& seq, int& group){ } catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripRBarcode"); + m->errorOut(e, "TrimOligos", "stripBarcode"); exit(1); } } -//********************************************************************/ + +/********************************************************************/ int TrimOligos::stripForward(Sequence& seq, int& group){ try { @@ -737,21 +844,9 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ if ((pdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it; - - for(it=primers.begin();it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -763,7 +858,7 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ string oligo = it->first; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ + if(rawSequence.length() < maxFPrimerLength){ success = pdiffs + 100; break; } @@ -849,21 +944,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, boo if ((pdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it; - - for(it=primers.begin();it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -875,7 +958,7 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, boo string oligo = it->first; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ + if(rawSequence.length() < maxFPrimerLength){ success = pdiffs + 100; break; } @@ -1024,19 +1107,9 @@ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ if ((ldiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (linker.size() > 0) { - for(int i = 0; i < linker.size(); i++){ - if(linker[i].length() > maxLength){ - maxLength = linker[i].length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); - - }else{ alignment = NULL; } + if (linker.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -1047,7 +1120,7 @@ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ string oligo = linker[i]; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length success = ldiffs + 10; break; } @@ -1133,19 +1206,9 @@ bool TrimOligos::stripLinker(Sequence& seq){ if ((ldiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (linker.size() > 0) { - for(int i = 0; i < linker.size(); i++){ - if(linker[i].length() > maxLength){ - maxLength = linker[i].length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); - - }else{ alignment = NULL; } + if (linker.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -1156,7 +1219,7 @@ bool TrimOligos::stripLinker(Sequence& seq){ string oligo = linker[i]; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length success = ldiffs + 10; break; } @@ -1240,19 +1303,9 @@ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ if ((sdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (spacer.size() > 0) { - for(int i = 0; i < spacer.size(); i++){ - if(spacer[i].length() > maxLength){ - maxLength = spacer[i].length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); - - }else{ alignment = NULL; } + if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -1263,7 +1316,7 @@ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ string oligo = spacer[i]; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length success = sdiffs + 10; break; } @@ -1349,19 +1402,9 @@ bool TrimOligos::stripSpacer(Sequence& seq){ if ((sdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - - int maxLength = 0; - Alignment* alignment; - if (spacer.size() > 0) { - for(int i = 0; i < spacer.size(); i++){ - if(spacer[i].length() > maxLength){ - maxLength = spacer[i].length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); - - }else{ alignment = NULL; } + if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; @@ -1372,7 +1415,7 @@ bool TrimOligos::stripSpacer(Sequence& seq){ string oligo = spacer[i]; // int length = oligo.length(); - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length success = sdiffs + 10; break; } diff --git a/trimoligos.h b/trimoligos.h index fb8f74d..6716deb 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -29,7 +29,7 @@ class TrimOligos { public: TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer - TrimOligos(int,int, int, int, map, map, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, linker, spacer + TrimOligos(int,int, int, int, map, map); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes ~TrimOligos(); int stripBarcode(Sequence&, int&); @@ -58,8 +58,14 @@ class TrimOligos { vector revPrimer; vector linker; vector spacer; - map ibarcodes; - map iprimers; + map > ifbarcodes; + map > ifprimers; + map > irbarcodes; + map > irprimers; + map ipbarcodes; + map ipprimers; + + int maxFBarcodeLength, maxRBarcodeLength, maxFPrimerLength, maxRPrimerLength, maxLinkerLength, maxSpacerLength; MothurOut* m; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 0c21c89..ba368fc 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -1036,10 +1036,10 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName HANDLE hThreadArray[processors-1]; //Create processor worker threads. - for( int i=0; i > tempFASTAFileNames = fastaFileNames; vector > tempPrimerQualFileNames = qualFileNames; vector > tempNameFileNames = nameFileNames; @@ -1081,14 +1081,14 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, - lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, + lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount); pDataArray.push_back(tempTrim); - hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); } //parent do my part @@ -1103,8 +1103,32 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->openOutputFile(trimNameFileName, temp); temp.close(); m->openOutputFile(scrapNameFileName, temp); temp.close(); } + vector > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + if(allFiles){ + ofstream temp; + string extension = toString(processors-1) + ".temp"; + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } - driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]); processIDS.push_back(processors-1); diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index cbec749..e698fff 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -384,16 +384,20 @@ int UnifracWeightedCommand::execute() { //subsample loop vector< vector > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f. - if (m->control_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. CountTable* newCt = new CountTable(); + int sampleTime = 0; + if (m->debug) { sampleTime = time(NULL); } + //uses method of setting groups to doNotIncludeMe SubSample sample; Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize); - + + if (m->debug) { m->mothurOut("[DEBUG]: iter " + toString(thisIter) + " took " + toString(time(NULL) - sampleTime) + " seconds to sample tree.\n"); } + //call new weighted function vector iterData; iterData.resize(numComp,0); Weighted thisWeighted(includeRoot); -- 2.39.2