From: westcott Date: Fri, 11 Mar 2011 15:54:35 +0000 (+0000) Subject: added namefile to align.check and added seqs from namefile to optimizing piece of... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=4f9a6e14a608172f8a97f0297a3b8e6ea267c518 added namefile to align.check and added seqs from namefile to optimizing piece of screen.seqs --- diff --git a/mothurout.cpp b/mothurout.cpp index 27b36ce..96e7bb8 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1096,6 +1096,36 @@ float MothurOut::ceilDist(float dist, int precision){ exit(1); } } +/**********************************************************************************************************************/ +map MothurOut::readNames(string namefile) { + try { + + map nameMap; + + //open input file + ifstream in; + openInputFile(namefile, in); + + while (!in.eof()) { + if (control_pressed) { break; } + + string firstCol, secondCol; + in >> firstCol >> secondCol; gobble(in); + + int num = getNumNames(secondCol); + + nameMap[firstCol] = num; + } + in.close(); + + return nameMap; + + } + catch(exception& e) { + errorOut(e, "MothurOut", "readNames"); + exit(1); + } +} /***********************************************************************/ diff --git a/mothurout.h b/mothurout.h index a1d9506..750973f 100644 --- a/mothurout.h +++ b/mothurout.h @@ -59,6 +59,7 @@ class MothurOut { string getline(istringstream&); void gobble(istream&); void gobble(istringstream&); + map readNames(string); //searchs and checks bool checkReleaseVersion(ifstream&, string); diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 150a27e..937959c 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -144,7 +144,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } alignreport = validParameter.validFile(parameters, "alignreport", true); @@ -248,9 +248,13 @@ int ScreenSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - //if the user want to optimize we need to no the 90% mark + //if the user want to optimize we need to know the 90% mark vector positions; - if (optimize.size() != 0) { getSummary(positions); } //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here + if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here + //use the namefile to optimize correctly + if (namefile != "") { nameMap = m->readNames(namefile); } + getSummary(positions); + } else { positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { @@ -625,7 +629,8 @@ int ScreenSeqsCommand::getSummary(vector& positions){ sort(ambigBases.begin(), ambigBases.end()); sort(longHomoPolymer.begin(), longHomoPolymer.end()); - int criteriaPercentile = int(numSeqs * (criteria / (float) 100)); + //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs + int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100)); for (int i = 0; i < optimize.size(); i++) { if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); } @@ -662,11 +667,24 @@ int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vectorgobble(in); if (current.getName() != "") { - startPosition.push_back(current.getStartPos()); - endPosition.push_back(current.getEndPos()); - seqLength.push_back(current.getNumBases()); - ambigBases.push_back(current.getAmbigBases()); - longHomoPolymer.push_back(current.getLongHomoPolymer()); + int num = 1; + if (namefile != "") { + //make sure this sequence is in the namefile, else error + map::iterator it = nameMap.find(current.getName()); + + if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + startPosition.push_back(current.getStartPos()); + endPosition.push_back(current.getEndPos()); + seqLength.push_back(current.getNumBases()); + ambigBases.push_back(current.getAmbigBases()); + longHomoPolymer.push_back(current.getLongHomoPolymer()); + } + count++; } @@ -712,6 +730,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, m->openOutputFile(tempFile, out); out << num << endl; + out << startPosition.size() << endl; for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl; for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl; for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl; @@ -744,6 +763,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, int temp, tempNum; in >> tempNum; m->gobble(in); num += tempNum; + in >> tempNum; m->gobble(in); for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in); for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in); for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in); diff --git a/screenseqscommand.h b/screenseqscommand.h index 81b915e..45c9db5 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -53,6 +53,8 @@ private: vector outputNames; vector optimize; map > outputTypes; + map nameMap; + int readNames(); int getSummary(vector&); int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string); diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index c9002ca..2b474c1 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -13,7 +13,7 @@ //********************************************************************************************************************** vector AlignCheckCommand::getValidParameters(){ try { - string Array[] = {"fasta","map", "outputdir","inputdir"}; + string Array[] = {"fasta", "name","map", "outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -69,7 +69,7 @@ AlignCheckCommand::AlignCheckCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta","map", "outputdir","inputdir"}; + string Array[] = {"fasta","name","map", "outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -107,6 +107,14 @@ AlignCheckCommand::AlignCheckCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["map"] = inputDir + it->second; } } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } } //check for required parameters @@ -118,6 +126,10 @@ AlignCheckCommand::AlignCheckCommand(string option) { if (fastafile == "not open") { abort = true; } else if (fastafile == "not found") { fastafile = ""; m->mothurOut("You must provide an fasta file."); m->mothurOutEndLine(); abort = true; } + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { namefile = ""; abort = true; } + else if (namefile == "not found") { namefile = ""; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; @@ -159,6 +171,10 @@ int AlignCheckCommand::execute(){ //get secondary structure info. readMap(); + if (namefile != "") { nameMap = m->readNames(namefile); } + + if (m->control_pressed) { return 0; } + ifstream in; m->openInputFile(fastafile, in); @@ -166,8 +182,9 @@ int AlignCheckCommand::execute(){ string outfile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "align.check"; m->openOutputFile(outfile, out); + out << "name" << '\t' << "pound" << '\t' << "dash" << '\t' << "plus" << '\t' << "equal" << '\t'; - out << "loop" << '\t' << "tilde" << '\t' << "total" << endl; + out << "loop" << '\t' << "tilde" << '\t' << "total" << '\t' << "numseqs" << endl; while(!in.eof()){ @@ -179,8 +196,17 @@ int AlignCheckCommand::execute(){ if (haderror == 1) { break; } + int num = 1; + if (namefile != "") { + //make sure this sequence is in the namefile, else error + map::iterator it = nameMap.find(seq.getName()); + + if (it == nameMap.end()) { cout << "[ERROR]: " << seq.getName() << " is not in your namefile, please correct." << endl; m->control_pressed = true; } + else { num = it->second; } + } + out << seq.getName() << '\t' << data.pound << '\t' << data.dash << '\t' << data.plus << '\t' << data.equal << '\t'; - out << data.loop << '\t' << data.tilde << '\t' << data.total << endl; + out << data.loop << '\t' << data.tilde << '\t' << data.total << '\t' << num << endl; } } @@ -305,6 +331,4 @@ statData AlignCheckCommand::getStats(string sequence){ exit(1); } } - - //********************************************************************************************************************** diff --git a/secondarystructurecommand.h b/secondarystructurecommand.h index c1cc057..0b654a4 100644 --- a/secondarystructurecommand.h +++ b/secondarystructurecommand.h @@ -44,11 +44,12 @@ class AlignCheckCommand : public Command { private: vector structMap; - string mapfile, fastafile, outputDir; + string mapfile, fastafile, outputDir, namefile; bool abort; int seqLength, haderror; vector outputNames; map > outputTypes; + map nameMap; void readMap(); statData getStats(string sequence); diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 8730997..c0f9196 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -172,7 +172,7 @@ int SeqSummaryCommand::execute(){ vector ambigBases; vector longHomoPolymer; - if (namefile != "") { readNames(); } + if (namefile != "") { nameMap = m->readNames(namefile); } if (m->control_pressed) { return 0; } @@ -480,7 +480,7 @@ int SeqSummaryCommand::MPICreateSummary(int start, int num, vector& startPo //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(current.getName()); - if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your namefile, please correct." << endl; m->control_pressed = true; } else { num = it->second; } } @@ -589,34 +589,6 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, } } /**********************************************************************************************************************/ -int SeqSummaryCommand::readNames() { - try { - //open input file - ifstream in; - m->openInputFile(namefile, in); - - while (!in.eof()) { - if (m->control_pressed) { break; } - - string firstCol, secondCol; - in >> firstCol >> secondCol; m->gobble(in); - - int num = m->getNumNames(secondCol); - - nameMap[firstCol] = num; - } - in.close(); - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SeqSummaryCommand", "readNames"); - exit(1); - } -} - -/**********************************************************************************************************************/ diff --git a/seqsummarycommand.h b/seqsummarycommand.h index fb24141..24c29a1 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -44,7 +44,6 @@ private: int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string, string); int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, string, linePair*); - int readNames(); #ifdef USE_MPI int MPICreateSummary(int, int, vector&, vector&, vector&, vector&, vector&, MPI_File&, MPI_File&, vector&);