From fdc1f6eaf544f695fc1511f24bddd7e6069c33ba Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 7 Apr 2010 12:55:09 +0000 Subject: [PATCH] added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff, paralellized filter.seqs and bellerophon. --- Mothur.xcodeproj/project.pbxproj | 24 + aligncommand.cpp | 310 +++++++++++- aligncommand.h | 5 + alignmentdb.cpp | 165 ++++++- alignmentdb.h | 5 +- bellerophon.cpp | 786 ++++++++++++++++++++++--------- bellerophon.h | 30 +- blastdb.cpp | 63 +++ blastdb.hpp | 8 +- ccode.cpp | 178 ++++++- ccode.h | 15 +- chimera.cpp | 158 ++++--- chimera.h | 81 ++-- chimerabellerophoncommand.cpp | 190 ++++++++ chimerabellerophoncommand.h | 39 ++ chimeraccodecommand.cpp | 574 ++++++++++++++++++++++ chimeraccodecommand.h | 56 +++ chimeracheckcommand.cpp | 464 ++++++++++++++++++ chimeracheckcommand.h | 57 +++ chimeracheckrdp.cpp | 186 +++++++- chimeracheckrdp.h | 11 +- chimerapintailcommand.cpp | 573 ++++++++++++++++++++++ chimerapintailcommand.h | 58 +++ chimeraseqscommand.cpp | 613 +----------------------- chimeraseqscommand.h | 22 - chimeraslayer.cpp | 198 ++++++-- chimeraslayer.h | 11 +- chimeraslayercommand.cpp | 604 ++++++++++++++++++++++++ chimeraslayercommand.h | 58 +++ classify.cpp | 121 ++++- classifyseqscommand.cpp | 285 ++++++++++- classifyseqscommand.h | 5 + cluster.cpp | 89 ++-- clustercommand.cpp | 7 +- commandfactory.cpp | 37 +- database.hpp | 6 +- distancecommand.cpp | 224 +++++++-- distancecommand.h | 1 + distancedb.hpp | 5 + filterseqscommand.cpp | 318 +++++++------ filterseqscommand.h | 12 +- fullmatrix.cpp | 6 +- globaldata.hpp | 2 +- kmerdb.cpp | 40 ++ kmerdb.hpp | 6 + mothur.h | 72 +++ nastreport.cpp | 51 ++ nastreport.hpp | 4 + parsesffcommand.cpp | 562 ++++++++++++++++++++++ parsesffcommand.h | 55 +++ pintail.cpp | 314 ++++++++++-- pintail.h | 14 +- readcolumn.cpp | 305 ++++++------ readdistcommand.cpp | 13 +- readdistcommand.h | 2 +- readmatrix.hpp | 3 +- readphylip.cpp | 8 +- sequence.cpp | 62 ++- 
sequence.hpp | 3 + suffixdb.cpp | 38 ++ suffixdb.hpp | 6 + validparameter.cpp | 12 + 62 files changed, 6693 insertions(+), 1537 deletions(-) create mode 100644 chimerabellerophoncommand.cpp create mode 100644 chimerabellerophoncommand.h create mode 100644 chimeraccodecommand.cpp create mode 100644 chimeraccodecommand.h create mode 100644 chimeracheckcommand.cpp create mode 100644 chimeracheckcommand.h create mode 100644 chimerapintailcommand.cpp create mode 100644 chimerapintailcommand.h create mode 100644 chimeraslayercommand.cpp create mode 100644 chimeraslayercommand.h create mode 100644 parsesffcommand.cpp create mode 100644 parsesffcommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index f07824e..24351a0 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -7,6 +7,16 @@ objects = { /* Begin PBXFileReference section */ + A747E79B1163442A00FB9042 /* chimeracheckcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckcommand.h; sourceTree = ""; }; + A747E79C1163442A00FB9042 /* chimeracheckcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckcommand.cpp; sourceTree = ""; }; + A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = ""; }; + A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = ""; }; + A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = ""; }; + A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = ""; }; + A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = ""; }; + A7825503116519F70002E2DD /* chimerabellerophoncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerabellerophoncommand.cpp; sourceTree = ""; }; + A78434881162224F00100BE0 /* chimeraccodecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraccodecommand.h; sourceTree = ""; }; + A78434891162224F00100BE0 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraccodecommand.cpp; sourceTree = ""; }; A7DA1FEC113FECD400BF472F /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = ""; }; A7DA1FED113FECD400BF472F /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = ""; }; A7DA1FEE113FECD400BF472F /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = ""; }; @@ -405,6 +415,8 @@ A7DA217A113FECD400BF472F /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = ""; }; A7DA217B113FECD400BF472F /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7DA217C113FECD400BF472F /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; + A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.cpp; path = parsesffcommand.cpp; sourceTree = ""; }; + A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsesffcommand.h; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXGroup section */ @@ -618,6 +630,16 @@ A7DA2008113FECD400BF472F /* bootstrapsharedcommand.h */, A7DA2017113FECD400BF472F /* chimeraseqscommand.cpp */, A7DA2018113FECD400BF472F /* chimeraseqscommand.h */, + A7825502116519F70002E2DD /* chimerabellerophoncommand.h */, + A7825503116519F70002E2DD /* chimerabellerophoncommand.cpp */, + A747E79B1163442A00FB9042 /* chimeracheckcommand.h */, + A747E79C1163442A00FB9042 /* chimeracheckcommand.cpp */, + A78434881162224F00100BE0 /* chimeraccodecommand.h */, + A78434891162224F00100BE0 /* chimeraccodecommand.cpp */, + A78254461164D7790002E2DD /* chimerapintailcommand.h */, + A78254471164D7790002E2DD /* chimerapintailcommand.cpp */, + A747E81C116365E000FB9042 /* chimeraslayercommand.h */, + A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */, A7DA201D113FECD400BF472F /* classifyseqscommand.cpp */, A7DA201E113FECD400BF472F /* classifyseqscommand.h */, A7DA2021113FECD400BF472F /* clustercommand.cpp */, @@ -674,6 +696,8 @@ A7DA20B9113FECD400BF472F /* otuhierarchycommand.h */, A7DA20BC113FECD400BF472F /* parselistscommand.cpp */, A7DA20BD113FECD400BF472F /* parselistscommand.h */, + A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */, + A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */, A7DA20C0113FECD400BF472F /* parsimonycommand.cpp */, A7DA20C1113FECD400BF472F /* parsimonycommand.h */, A7DA20C2113FECD400BF472F /* pcacommand.cpp */, diff --git a/aligncommand.cpp b/aligncommand.cpp index 257587f..a4b3a79 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -32,7 +32,7 @@ AlignCommand::AlignCommand(string option) { try { abort = false; - + //allow user to run help if(option == "help") { help(); abort = true; } @@ -95,23 +95,45 @@ 
AlignCommand::AlignCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { candidateFileNames[i] = inputDir + candidateFileNames[i]; } } - + int ableToOpen; ifstream in; + + #ifdef USE_MPI + int pid; + MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + ableToOpen = openInputFile(candidateFileNames[i], in); + in.close(); + + #ifdef USE_MPI + for (int j = 1; j < processors; j++) { + MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Status status; + MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + + #endif + if (ableToOpen == 1) { m->mothurOut(candidateFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); //erase from file list candidateFileNames.erase(candidateFileNames.begin()+i); i--; } - in.close(); + } //make sure there is at least one valid file left if (candidateFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } } - + //check for optional parameter and set defaults // ...at some point should added some additional type checking... 
string temp; @@ -197,10 +219,10 @@ void AlignCommand::help(){ int AlignCommand::execute(){ try { if (abort == true) { return 0; } - + templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); int longestBase = templateDB->getLongestBase(); - + if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } @@ -226,8 +248,111 @@ int AlignCommand::execute(){ int numFastaSeqs = 0; for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); int start = time(NULL); + +#ifdef USE_MPI + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + MPIWroteAccnos = false; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPIAlign; + MPI_File outMPIReport; + MPI_File outMPIAccnos; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outAlignFilename[alignFileName.length()]; + strcpy(outAlignFilename, alignFileName.c_str()); + + char outReportFilename[reportFileName.length()]; + strcpy(outReportFilename, reportFileName.c_str()); + + char outAccnosFilename[accnosFileName.length()]; + strcpy(outAccnosFilename, accnosFileName.c_str()); + + char inFileName[candidateFileNames[s].length()]; + strcpy(inFileName, candidateFileNames[s].c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outAlignFilename, outMode, MPI_INFO_NULL, &outMPIAlign); + MPI_File_open(MPI_COMM_WORLD, outReportFilename, outMode, MPI_INFO_NULL, &outMPIReport); + MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, 
&outMPIAccnos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIAlign); MPI_File_close(&outMPIReport); MPI_File_close(&outMPIAccnos); return 0; } + + if (pid == 0) { //you are the root process + + MPIPos = setFilePosFasta(candidateFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIAlign, outMPIReport, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIAlign); MPI_File_close(&outMPIReport); MPI_File_close(&outMPIAccnos); return 0; } + + for (int i = 1; i < processors; i++) { + bool tempResult; + MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + if (tempResult != 0) { MPIWroteAccnos = true; } + } + }else{ //you are a child process + MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(numFastaSeqs+1); + MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIAlign, outMPIReport, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIAlign); MPI_File_close(&outMPIReport); MPI_File_close(&outMPIAccnos); return 
0; } + + MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPIAlign); + MPI_File_close(&outMPIReport); + MPI_File_close(&outMPIAccnos); + + //delete accnos file if blank + if (pid == 0) { + //delete accnos file if its blank else report to user + if (MPIWroteAccnos) { + m->mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); + if (!flip) { + m->mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); + }else{ m->mothurOut(" If the reverse compliment proved to be better it was reported."); } + m->mothurOutEndLine(); + }else { + //MPI_Info info; + //MPI_File_delete(outAccnosFilename, info); + hasAccnos = false; + remove(accnosFileName.c_str()); + } + } + +#else -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ ifstream inFASTA; openInputFile(candidateFileNames[s], inFASTA); @@ -327,7 +452,7 @@ int AlignCommand::execute(){ return 0; } } -#else + #else ifstream inFASTA; openInputFile(candidateFileNames[s], inFASTA); numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); @@ -354,12 +479,25 @@ int AlignCommand::execute(){ m->mothurOutEndLine(); } -#endif - + #endif + +#endif + + + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { //only one process should output to screen + #endif + outputNames.push_back(alignFileName); outputNames.push_back(reportFileName); if (hasAccnos) { outputNames.push_back(accnosFileName); } - + + #ifdef USE_MPI + } + #endif + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); @@ -395,12 +533,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, 
openInputFile(filename, inFASTA); inFASTA.seekg(line->start); - + for(int i=0;inumSeqs;i++){ if (m->control_pressed) { return 0; } Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); int numBasesNeeded = origNumBases * threshold; @@ -491,7 +630,153 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, exit(1); } } +//********************************************************************************************************************** +#ifdef USE_MPI +int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector& MPIPos){ + try { + string outputString = ""; + MPI_Status statusReport; + MPI_Status statusAlign; + MPI_Status statusAccnos; + MPI_Status status; + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + NastReport report; + + if (pid == 0) { + outputString = report.getHeaders(); + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write_shared(reportFile, buf, length, MPI_CHAR, &statusReport); + } + + for(int i=0;icontrol_pressed) { return 0; } + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); + int origNumBases = candidateSeq->getNumBases(); + string originalUnaligned = candidateSeq->getUnaligned(); + int numBasesNeeded = origNumBases * threshold; + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + 
alignment->resize(candidateSeq->getUnaligned().length()+1); + } + + Sequence temp = templateDB->findClosestSequence(candidateSeq); + Sequence* templateSeq = &temp; + + float searchScore = templateDB->getSearchScore(); + + Nast* nast = new Nast(alignment, candidateSeq, templateSeq); + Sequence* copy; + + Nast* nast2; + bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below + //since nast does not make a copy of hte sequence passed, and it is used by the reporter below + //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place + //so this bool tells you if you need to delete it + + //if there is a possibility that this sequence should be reversed + if (candidateSeq->getNumBases() < numBasesNeeded) { + + string wasBetter = ""; + //if the user wants you to try the reverse + if (flip) { + //get reverse compliment + copy = new Sequence(candidateSeq->getName(), originalUnaligned); + copy->reverseComplement(); + + //rerun alignment + Sequence temp2 = templateDB->findClosestSequence(copy); + Sequence* templateSeq2 = &temp2; + + searchScore = templateDB->getSearchScore(); + + nast2 = new Nast(alignment, copy, templateSeq2); + + //check if any better + if (copy->getNumBases() > candidateSeq->getNumBases()) { + candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better + templateSeq = templateSeq2; + delete nast; + nast = nast2; + needToDeleteCopy = true; + }else{ + wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence."; + delete nast2; + delete copy; + } + } + + //create accnos file with names + outputString = candidateSeq->getName() + wasBetter + "\n"; + + //send results to parent + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write_shared(accnosFile, buf, length, MPI_CHAR, &statusAccnos); + MPIWroteAccnos = true; + } + + 
report.setCandidate(candidateSeq); + report.setTemplate(templateSeq); + report.setSearchParameters(search, searchScore); + report.setAlignmentParameters(align, alignment); + report.setNastParameters(*nast); + + outputString = ">" + candidateSeq->getName() + "\n" + candidateSeq->getAligned() + "\n"; + + //send results to parent + int length = outputString.length(); + char buf2[length]; + strcpy(buf2, outputString.c_str()); + + MPI_File_write_shared(alignFile, buf2, length, MPI_CHAR, &statusAlign); + + outputString = report.getReport(); + + //send results to parent + length = outputString.length(); + char buf3[length]; + strcpy(buf3, outputString.c_str()); + + MPI_File_write_shared(reportFile, buf3, length, MPI_CHAR, &statusReport); + + delete nast; + if (needToDeleteCopy) { delete copy; } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ cout << (toString(i+1)) << endl; } + } + //report progress + if((num) % 100 != 0){ cout << (toString(num)) << endl; } + + return 1; + } + catch(exception& e) { + m->errorOut(e, "AlignCommand", "driverMPI"); + exit(1); + } +} +#endif /**************************************************************************************************/ int AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName, string filename) { @@ -577,5 +862,4 @@ void AlignCommand::appendReportFiles(string temp, string filename) { exit(1); } } - //********************************************************************************************************************** diff --git a/aligncommand.h b/aligncommand.h index f0496a5..b100287 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -32,6 +32,7 @@ private: }; vector processIDS; //processid vector lines; + bool MPIWroteAccnos; AlignmentDB* templateDB; Alignment* alignment; @@ -41,6 +42,10 @@ private: void appendAlignFiles(string, string); void appendReportFiles(string, string); + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, 
MPI_File&, vector&); + #endif + string candidateFileName, templateFileName, distanceFileName, search, align, outputDir; float match, misMatch, gapOpen, gapExtend, threshold; int processors, kmerSize; diff --git a/alignmentdb.cpp b/alignmentdb.cpp index 51fb175..4b324b4 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -14,17 +14,69 @@ /**************************************************************************************************/ -AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch){ // This assumes that the template database is in fasta format, may +AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch){ // This assumes that the template database is in fasta format, may try { // need to alter this in the future? m = MothurOut::getInstance(); longest = 0; - - ifstream fastaFile; - openInputFile(fastaFileName, fastaFile); + method = s; + bool needToGenerate = true; m->mothurOutEndLine(); m->mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); + #ifdef USE_MPI + int pid; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[fastaFileName.length()]; + strcpy(inFileName, fastaFileName.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = setFilePosFasta(fastaFileName, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(numSeqs+1); + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, 
MPI_COMM_WORLD); //get file positions + } + + //read file + for(int i=0;icontrol_pressed) { templateSequences.clear(); break; } + + //read next sequence + int length = positions[i+1] - positions[i]; + char buf4[length]; + MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + templateSequences.push_back(temp); + //save longest base + if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length()+1; } + } + } + + MPI_File_close(&inMPI); + #else + ifstream fastaFile; + openInputFile(fastaFileName, fastaFile); + while (!fastaFile.eof()) { Sequence temp(fastaFile); gobble(fastaFile); @@ -36,10 +88,11 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length()+1; } } } + fastaFile.close(); + #endif + numSeqs = templateSequences.size(); - - fastaFile.close(); //all of this is elsewhere already! 
m->mothurOut("DONE."); @@ -51,15 +104,18 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); - bool needToGenerate = true; + string kmerDBName; if(method == "kmer") { search = new KmerDB(fastaFileName, kmerSize); - kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); + #ifdef USE_MPI + #else + kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); - if(kmerFileTest){ needToGenerate = false; } + if(kmerFileTest){ needToGenerate = false; } + #endif } else if(method == "suffix") { search = new SuffixDB(numSeqs); } else if(method == "blast") { search = new BlastDB(gapOpen, gapExtend, match, misMatch); } @@ -74,7 +130,12 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa //add sequences to search for (int i = 0; i < templateSequences.size(); i++) { search->addSequence(templateSequences[i]); + + if (m->control_pressed) { templateSequences.clear(); break; } } + + if (m->control_pressed) { templateSequences.clear(); } + search->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { @@ -91,6 +152,29 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa } } /**************************************************************************************************/ +AlignmentDB::AlignmentDB(string s){ + try { + m = MothurOut::getInstance(); + method = s; + + if(method == "suffix") { search = new SuffixDB(); } + else if(method == "blast") { search = new BlastDB(); } + else { search = new KmerDB(); } + + + //in case you delete the seqs and then ask for them + emptySequence = Sequence(); + emptySequence.setName("no_match"); + 
emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + + } + catch(exception& e) { + m->errorOut(e, "AlignmentDB", "AlignmentDB"); + exit(1); + } +} +/**************************************************************************************************/ AlignmentDB::~AlignmentDB() { delete search; } /**************************************************************************************************/ Sequence AlignmentDB::findClosestSequence(Sequence* seq) { @@ -107,7 +191,68 @@ Sequence AlignmentDB::findClosestSequence(Sequence* seq) { exit(1); } } +#ifdef USE_MPI /**************************************************************************************************/ +int AlignmentDB::MPISend(int receiver) { + try { + + //send numSeqs - int + MPI_Send(&numSeqs, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + //send longest - int + MPI_Send(&longest, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + //send templateSequences + for (int i = 0; i < templateSequences.size(); i++) { + templateSequences[i].MPISend(receiver); + } + + //send Database + search->MPISend(receiver); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignmentDB", "MPISend"); + exit(1); + } +} +/**************************************************************************************************/ +int AlignmentDB::MPIRecv(int sender) { + try { + MPI_Status status; + //receive numSeqs - int + MPI_Recv(&numSeqs, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + //receive longest - int + MPI_Recv(&longest, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + //receive templateSequences + templateSequences.resize(numSeqs); + for (int i = 0; i < templateSequences.size(); i++) { + templateSequences[i].MPIRecv(sender); + } + + //receive Database + search->MPIRecv(sender); + + for (int i = 0; i < templateSequences.size(); i++) { + search->addSequence(templateSequences[i]); + } + search->generateDB(); + 
search->setNumSeqs(numSeqs); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignmentDB", "MPIRecv"); + exit(1); + } +} +#endif +/**************************************************************************************************/ + + diff --git a/alignmentdb.h b/alignmentdb.h index d665f78..a69b917 100644 --- a/alignmentdb.h +++ b/alignmentdb.h @@ -21,15 +21,18 @@ class AlignmentDB { public: AlignmentDB(string, string, int, float, float, float, float); //reads fastafile passed in and stores sequences + AlignmentDB(string); ~AlignmentDB(); Sequence findClosestSequence(Sequence*); float getSearchScore() { return search->getSearchScore(); } int getLongestBase() { return longest; } + int MPISend(int); + int MPIRecv(int); private: int numSeqs, longest; - float searchScore; + string method; Database* search; vector templateSequences; diff --git a/bellerophon.cpp b/bellerophon.cpp index 54dfb9b..25c5de7 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -13,12 +13,60 @@ #include "onegapdist.h" -//*************************************************************************************************************** +/***************************************************************************************************************/ -Bellerophon::Bellerophon(string name, string o) { +Bellerophon::Bellerophon(string name, bool filterSeqs, bool c, int win, int inc, int p, string o) : Chimera() { try { fastafile = name; + correction = c; outputDir = o; + window = win; + increment = inc; + processors = p; + + //read in sequences + seqs = readSeqs(fastafile); + numSeqs = seqs.size(); + if (numSeqs == 0) { m->mothurOut("Error in reading you sequences."); m->mothurOutEndLine(); exit(1); } + + //do soft filter + if (filterSeqs) { + createFilter(seqs, 0.5); + for (int i = 0; i < seqs.size(); i++) { runFilter(seqs[i]); } + } + + distCalculator = new eachGapDist(); + + //set default window to 25% of sequence length + string seq0 = seqs[0]->getAligned(); + if (window == 0) { window 
= seq0.length() / 4; } + else if (window > (seq0.length() / 2)) { + m->mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); m->mothurOutEndLine(); + window = (seq0.length() / 2); + } + + if (increment > (seqs[0]->getAlignLength() - (2*window))) { + if (increment != 10) { + + m->mothurOut("You have selected a increment that is too large. I will use the default."); m->mothurOutEndLine(); + increment = 10; + if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; } + + }else{ increment = 0; } + } + + if (increment == 0) { iters = 1; } + else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); } + + //initialize pref + pref.resize(iters); + for (int i = 0; i < iters; i++) { + Preference temp; + for (int j = 0; j < numSeqs; j++) { + pref[i].push_back(temp); + } + } + } catch(exception& e) { m->errorOut(e, "Bellerophon", "Bellerophon"); @@ -30,20 +78,26 @@ Bellerophon::Bellerophon(string name, string o) { int Bellerophon::print(ostream& out, ostream& outAcc) { try { int above1 = 0; + + //sorted "best" preference scores for all seqs + vector best = getBestPref(); + + if (m->control_pressed) { return numSeqs; } + out << "Name\tScore\tLeft\tRight\t" << endl; //output prefenence structure to .chimeras file - for (int i = 0; i < pref.size(); i++) { + for (int i = 0; i < best.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return numSeqs; } - out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; + out << best[i].name << '\t' << setprecision(3) << best[i].score << '\t' << best[i].leftParent << '\t' << best[i].rightParent << endl; //calc # of seqs with preference above 1.0 - if (pref[i].score[0] > 1.0) { + if (best[i].score > 1.0) { above1++; - outAcc 
<< pref[i].name << endl; - m->mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); m->mothurOutEndLine(); - m->mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); m->mothurOutEndLine(); + outAcc << best[i].name << endl; + m->mothurOut(best[i].name + " is a suspected chimera at breakpoint " + toString(best[i].midpoint)); m->mothurOutEndLine(); + m->mothurOut("It's score is " + toString(best[i].score) + " with suspected left parent " + best[i].leftParent + " and right parent " + best[i].rightParent); m->mothurOutEndLine(); } } @@ -51,22 +105,22 @@ int Bellerophon::print(ostream& out, ostream& outAcc) { m->mothurOutEndLine(); m->mothurOut("Sequence with preference score above 1.0: " + toString(above1)); m->mothurOutEndLine(); int spot; - spot = pref.size()-1; - m->mothurOut("Minimum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); - spot = pref.size() * 0.975; - m->mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); - spot = pref.size() * 0.75; - m->mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); - spot = pref.size() * 0.50; - m->mothurOut("Median: \t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); - spot = pref.size() * 0.25; - m->mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); - spot = pref.size() * 0.025; - m->mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); + spot = best.size()-1; + m->mothurOut("Minimum:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.975; + m->mothurOut("2.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.75; + m->mothurOut("25%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.50; + m->mothurOut("Median: \t" + 
toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.25; + m->mothurOut("75%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.025; + m->mothurOut("97.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); spot = 0; - m->mothurOut("Maximum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine(); + m->mothurOut("Maximum:\t" + toString(best[spot].score)); m->mothurOutEndLine(); - return 1; + return numSeqs; } catch(exception& e) { @@ -74,191 +128,361 @@ int Bellerophon::print(ostream& out, ostream& outAcc) { exit(1); } } - -//******************************************************************************************************************** -//sorts highest score to lowest -inline bool comparePref(Preference left, Preference right){ - return (left.score[0] > right.score[0]); -} - +#ifdef USE_MPI //*************************************************************************************************************** -int Bellerophon::getChimeras() { +int Bellerophon::print(MPI_File& out, MPI_File& outAcc) { try { - //do soft filter - if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50"; - if (outputDir != "") { optionString += ", outputdir=" + outputDir; } + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + string outString = ""; + + //sorted "best" preference scores for all seqs + vector best = getBestPref(); - filterSeqs = new FilterSeqsCommand(optionString); - filterSeqs->execute(); - delete filterSeqs; + int above1 = 0; + int ninetyfive = best.size() * 0.05; + float cutoffScore = best[ninetyfive].score; + + if (m->control_pressed) { return numSeqs; } - if (m->control_pressed) { return 0; } + outString += "Name\tScore\tLeft\tRight\n"; + //output prefenence structure to .chimeras file + for (int i = 0; i < best.size(); i++) { + + if (m->control_pressed) { return numSeqs; } + + outString += best[i].name + "\t" + toString(best[i].score) + 
"\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n"; + + MPI_Status status; + int length = outString.length(); + char buf2[length + 1]; //+1 because strcpy also writes the terminating NUL + strcpy(buf2, outString.c_str()); + + MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); outString = ""; //reset so rows already written are not written again on the next pass + + + //calc # of seqs with preference above 95%tile + if (best[i].score >= cutoffScore) { + above1++; + string outAccString; + outAccString += best[i].name + "\n"; + + MPI_Status statusAcc; + length = outAccString.length(); + char buf[length + 1]; //+1 because strcpy also writes the terminating NUL + strcpy(buf, outAccString.c_str()); + + MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); + + cout << best[i].name << " is a suspected chimera at breakpoint " << toString(best[i].midpoint) << endl; + cout << "It's score is " << toString(best[i].score) << " with suspected left parent " << best[i].leftParent << " and right parent " << best[i].rightParent << endl; + } + } - //reset fastafile to filtered file - if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; } - else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; } + //output results to screen + m->mothurOutEndLine(); + m->mothurOut("Sequence with preference score above " + toString(cutoffScore) + ": " + toString(above1)); m->mothurOutEndLine(); + int spot; + spot = best.size()-1; + m->mothurOut("Minimum:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.975; + m->mothurOut("2.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.75; + m->mothurOut("25%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.50; + m->mothurOut("Median: \t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.25; + m->mothurOut("75%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = best.size() * 0.025; + m->mothurOut("97.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine(); + spot = 0; + m->mothurOut("Maximum:\t" + 
toString(best[spot].score)); m->mothurOutEndLine(); } - distCalculator = new eachGapDist(); + return numSeqs; - //read in sequences - seqs = readSeqs(fastafile); + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "print"); + exit(1); + } +} +#endif +//******************************************************************************************************************** +//sorts highest score to lowest +inline bool comparePref(Preference left, Preference right){ + return (left.score > right.score); +} +//*************************************************************************************************************** +int Bellerophon::getChimeras() { + try { - if (m->control_pressed) { return 0; } + //create breaking points + vector midpoints; midpoints.resize(iters, window); + for (int i = 1; i < iters; i++) { midpoints[i] = midpoints[i-1] + increment; } - if (unaligned) { m->mothurOut("Your sequences need to be aligned when you use the bellerophon method."); m->mothurOutEndLine(); return 1; } - - int numSeqs = seqs.size(); - - if (numSeqs == 0) { m->mothurOut("Error in reading you sequences."); m->mothurOutEndLine(); exit(1); } + #ifdef USE_MPI + int pid, numSeqsPerProcessor; + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); - //set default window to 25% of sequence length - string seq0 = seqs[0]->getAligned(); - if (window == 0) { window = seq0.length() / 4; } - else if (window > (seq0.length() / 2)) { - m->mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); m->mothurOutEndLine(); - window = (seq0.length() / 2); - } + numSeqsPerProcessor = iters / processors; - if (increment > (seqs[0]->getAlignLength() - (2*window))) { - if (increment != 10) { - - m->mothurOut("You have selected a increment that is too large. 
I will use the default."); m->mothurOutEndLine(); - increment = 10; - if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; } - - }else{ increment = 0; } + //each process hits this only once + int startPos = pid * numSeqsPerProcessor; + if(pid == processors - 1){ + numSeqsPerProcessor = iters - pid * numSeqsPerProcessor; } + lines.push_back(linePair(startPos, numSeqsPerProcessor)); - if (increment == 0) { iters = 1; } - else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); } + //fill pref with scores + driverChimeras(midpoints, lines[0]); - //initialize pref - pref.resize(numSeqs); - - for (int i = 0; i < numSeqs; i++ ) { - pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); - pref[i].name = seqs[i]->getName(); - pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; - pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; - pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; - } - - int midpoint = window; - int count = 0; - while (count < iters) { - - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return 0; } - //create 2 vectors of sequences, 1 for left side and one for right side - vector left; vector right; + //each process must send its parts back to pid 0 + if (pid == 0) { + + //receive results + for (int j = 1; j < processors; j++) { - for (int i = 0; i < seqs.size(); i++) { + vector MPIBestSend; + for (int i = 0; i < numSeqs; i++) { if (m->control_pressed) { return 0; } + + MPI_Status status; + //receive string + int length; + MPI_Recv(&length, 1, MPI_INT, j, 2001, MPI_COMM_WORLD, &status); -//cout << "midpoint = " << midpoint << "\twindow = " << window << endl; -//cout << "whole = " << seqs[i]->getAligned().length() << endl; - //save left side - string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window); - Sequence tempLeft; - tempLeft.setName(seqs[i]->getName()); - 
tempLeft.setAligned(seqLeft); - left.push_back(tempLeft); -//cout << "left = " << tempLeft.getAligned().length() << endl; - //save right side - string seqRight = seqs[i]->getAligned().substr(midpoint, window); - Sequence tempRight; - tempRight.setName(seqs[i]->getName()); - tempRight.setAligned(seqRight); - right.push_back(tempRight); -//cout << "right = " << seqRight.length() << endl; + char buf[length]; + MPI_Recv(&buf, length, MPI_CHAR, j, 2001, MPI_COMM_WORLD, &status); + + string temp(buf, length); //sender transmits exactly length chars with no NUL, so copy by count instead of scanning for a terminator + if (temp.length() > length) { temp = temp.substr(0, length); } + + MPIBestSend.push_back(temp); } - //adjust midpoint by increment - midpoint += increment; - + fillPref(j, MPIBestSend); - //this should be parallelized - //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) - //create a matrix containing the distance from left to left and right to right - //calculate distances - SparseMatrix* SparseLeft = new SparseMatrix(); - SparseMatrix* SparseRight = new SparseMatrix(); - - createSparseMatrix(0, left.size(), SparseLeft, left); + if (m->control_pressed) { return 0; } + } + + }else { + //takes best window for each sequence and turns Preference to string that can be parsed by pid 0. + //played with this a bit, but it may be better to try user-defined datatypes with set string lengths?? 
+ vector MPIBestSend = getBestWindow(lines[0]); + pref.clear(); + + //send your result to parent + for (int i = 0; i < numSeqs; i++) { - if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + if (m->control_pressed) { return 0; } - createSparseMatrix(0, right.size(), SparseRight, right); + int bestLength = MPIBestSend[i].length(); + char buf[bestLength + 1]; //+1 because strcpy also writes the terminating NUL + strcpy(buf, MPIBestSend[i].c_str()); - if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD); + MPI_Send(buf, bestLength, MPI_CHAR, 0, 2001, MPI_COMM_WORLD); + } + + MPIBestSend.clear(); + } + + #else + + //divide breakpoints between processors + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + lines.push_back(linePair(0, iters)); - left.clear(); right.clear(); - vector distMapRight; - vector distMapLeft; + //fill pref with scores + driverChimeras(midpoints, lines[0]); + + }else{ + + int numSeqsPerProcessor = iters / processors; - // Create a data structure to quickly access the distance information. - //this is from thallingers reimplementation on get.oturep - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. 
Vector and maps are accessed - // via the index of a sequence in the distance matrix - distMapRight = vector(numSeqs); - distMapLeft = vector(numSeqs); - //cout << "left" << endl << endl; - for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { - distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; - if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; - } - //cout << "right" << endl << endl; - for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { - distMapRight[currentCell->row][currentCell->column] = currentCell->dist; - if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + for (int i = 0; i < processors; i++) { + int startPos = i * numSeqsPerProcessor; + if(i == processors - 1){ + numSeqsPerProcessor = iters - i * numSeqsPerProcessor; + } + lines.push_back(linePair(startPos, numSeqsPerProcessor)); } - delete SparseLeft; - delete SparseRight; - - //fill preference structure - generatePreferences(distMapLeft, distMapRight, midpoint); - - count++; - - } - - delete distCalculator; - - //rank preference score to eachother - float dme = 0.0; - float expectedPercent = 1 / (float) (pref.size()); - - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; } + createProcesses(midpoints); + } + #else + lines.push_back(linePair(0, iters)); + + ///fill pref with scores + driverChimeras(midpoints, lines[0]); + #endif + + #endif - for (int i = 0; i < pref.size(); i++) { + return 0; + + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "getChimeras"); + exit(1); + } +} +/**************************************************************************************************/ - 
//gives the actual percentage of the dme this seq adds - pref[i].score[0] = pref[i].score[0] / dme; +int Bellerophon::createProcesses(vector mid) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + int exitCommand = 1; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); - //how much higher or lower is this than expected - pref[i].score[0] = pref[i].score[0] / expectedPercent; + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + exitCommand = driverChimeras(mid, lines[process]); + string tempOut = outputDir + toString(getpid()) + ".temp"; + writePrefs(tempOut, lines[process]); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "AlignCommand", "createProcesses"); + exit(1); + } +} +//*************************************************************************************************************** +int Bellerophon::driverChimeras(vector midpoints, linePair line) { + try { + + for (int h = line.start; h < (line.start + line.num); h++) { + count = h; + int midpoint = midpoints[h]; - for (int i = 0; i < seqs.size(); i++) { delete seqs[i]; } seqs.clear(); + //initialize pref[count] + for (int i = 0; i < numSeqs; i++ ) { + pref[count][i].name = seqs[i]->getName(); + pref[count][i].midpoint = midpoint; + } + + if (m->control_pressed) { return 0; } + + //create 2 vectors of sequences, 1 for left side and one for right side + vector left; vector right; + + for (int i = 0; i < seqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + //cout << "midpoint = " << midpoint << "\twindow = " << window << endl; + //cout << "whole = " << seqs[i]->getAligned().length() << endl; + //save left side + 
string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window); + Sequence tempLeft; + tempLeft.setName(seqs[i]->getName()); + tempLeft.setAligned(seqLeft); + left.push_back(tempLeft); + //cout << "left = " << tempLeft.getAligned().length() << endl; + //save right side + string seqRight = seqs[i]->getAligned().substr(midpoint, window); + Sequence tempRight; + tempRight.setName(seqs[i]->getName()); + tempRight.setAligned(seqRight); + right.push_back(tempRight); + //cout << "right = " << seqRight.length() << endl; + } + + //this should be parallelized + //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) + //create a matrix containing the distance from left to left and right to right + //calculate distances + SparseMatrix* SparseLeft = new SparseMatrix(); + SparseMatrix* SparseRight = new SparseMatrix(); + + createSparseMatrix(0, left.size(), SparseLeft, left); + + if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + + createSparseMatrix(0, right.size(), SparseRight, right); + + if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + + left.clear(); right.clear(); + vector distMapRight; + vector distMapLeft; + + // Create a data structure to quickly access the distance information. + //this is from thallingers reimplementation on get.oturep + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. 
Vector and maps are accessed + // via the index of a sequence in the distance matrix + distMapRight = vector(numSeqs); + distMapLeft = vector(numSeqs); + //cout << "left" << endl << endl; + for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { + distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; + if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + //cout << "right" << endl << endl; + for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { + distMapRight[currentCell->row][currentCell->column] = currentCell->dist; + if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; } + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + + delete SparseLeft; + delete SparseRight; + + //fill preference structure + generatePreferences(distMapLeft, distMapRight, midpoint); + + if (m->control_pressed) { return 0; } + + //report progress + if((h+1) % 10 == 0){ cout << "Processing sliding window: " << toString(h+1) << "\n"; m->mothurOutJustToLog("Processing sliding window: " + toString(h+1) + "\n") ; } + + } + //report progress + if((line.start + line.num) % 10 != 0){ cout << "Processing sliding window: " << toString(line.start + line.num) << "\n"; m->mothurOutJustToLog("Processing sliding window: " + toString(line.start + line.num) + "\n") ; } + return 0; } catch(exception& e) { - m->errorOut(e, "Bellerophon", "getChimeras"); + m->errorOut(e, "Bellerophon", "driverChimeras"); exit(1); } } @@ -297,15 +521,6 @@ int Bellerophon::generatePreferences(vector left, vector right, SeqMap::iterator itR; SeqMap::iterator itL; - //initialize pref[i] - for (int i = 0; i < pref.size(); i++) { - pref[i].score[1] = 0.0; - pref[i].closestLeft[1] = 100000.0; 
- pref[i].closestRight[1] = 100000.0; - pref[i].leftParent[1] = ""; - pref[i].rightParent[1] = ""; - } - for (int i = 0; i < left.size(); i++) { SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, @@ -326,15 +541,15 @@ int Bellerophon::generatePreferences(vector left, vector right, if ((itL != currentLeft.end()) && (itR != currentRight.end())) { if (!correction) { - pref[i].score[1] += abs((itL->second - itR->second)); - pref[j].score[1] += abs((itL->second - itR->second)); + pref[count][i].score += abs((itL->second - itR->second)); + pref[count][j].score += abs((itL->second - itR->second)); //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; //cout << "abs = " << abs((itL->second - itR->second)) << endl; //cout << i << " score = " << pref[i].score[1] << endl; //cout << j << " score = " << pref[j].score[1] << endl; }else { - pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); - pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); + pref[count][i].score += abs((sqrt(itL->second) - sqrt(itR->second))); + pref[count][j].score += abs((sqrt(itL->second) - sqrt(itR->second))); //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; //cout << i << " score = " << pref[i].score[1] << endl; @@ -342,27 +557,27 @@ int Bellerophon::generatePreferences(vector left, vector right, } //cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; //are you the closest left sequence - if (itL->second < pref[i].closestLeft[1]) { + if (itL->second < pref[count][i].closestLeft) { - pref[i].closestLeft[1] = itL->second; - pref[i].leftParent[1] = seqs[j]->getName(); + pref[count][i].closestLeft = itL->second; + 
pref[count][i].leftParent = seqs[j]->getName(); //cout << "updating closest left to " << pref[i].leftParent[1] << endl; } //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; - if (itL->second < pref[j].closestLeft[1]) { - pref[j].closestLeft[1] = itL->second; - pref[j].leftParent[1] = seqs[i]->getName(); + if (itL->second < pref[count][j].closestLeft) { + pref[count][j].closestLeft = itL->second; + pref[count][j].leftParent = seqs[i]->getName(); //cout << "updating closest left to " << pref[j].leftParent[1] << endl; } //are you the closest right sequence - if (itR->second < pref[i].closestRight[1]) { - pref[i].closestRight[1] = itR->second; - pref[i].rightParent[1] = seqs[j]->getName(); + if (itR->second < pref[count][i].closestRight) { + pref[count][i].closestRight = itR->second; + pref[count][i].rightParent = seqs[j]->getName(); } - if (itR->second < pref[j].closestRight[1]) { - pref[j].closestRight[1] = itR->second; - pref[j].rightParent[1] = seqs[i]->getName(); + if (itR->second < pref[count][j].closestRight) { + pref[count][j].closestRight = itR->second; + pref[count][j].rightParent = seqs[i]->getName(); } } @@ -370,55 +585,190 @@ int Bellerophon::generatePreferences(vector left, vector right, } + + return 1; + + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "generatePreferences"); + exit(1); + } +} +/**************************************************************************************************/ +vector Bellerophon::getBestPref() { + try { + + vector best; - - //calculate the dme - int count0 = 0; - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } + //for each sequence + for (int i = 0; i < numSeqs; i++) { + + //set best pref score to first one + Preference temp = pref[0][i]; + + if (m->control_pressed) { return best; } + + //for each window + for (int j = 1; j < pref.size(); j++) { + + //is this a better score + 
if (pref[j][i].score > temp.score) { temp = pref[j][i]; } + } + + best.push_back(temp); + } - float expectedPercent = 1 / (float) (pref.size() - count0); -//cout << endl << "dme = " << dme << endl; - //recalculate prefernences based on dme - for (int i = 0; i < pref.size(); i++) { + //rank preference score to eachother + float dme = 0.0; + float expectedPercent = 1 / (float) (best.size()); - if (m->control_pressed) { return 0; } -//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; - // gives the actual percentage of the dme this seq adds - pref[i].score[1] = pref[i].score[1] / dme; + for (int i = 0; i < best.size(); i++) { dme += best[i].score; } + + for (int i = 0; i < best.size(); i++) { + + if (m->control_pressed) { return best; } + + //gives the actual percentage of the dme this seq adds + best[i].score = best[i].score / dme; //how much higher or lower is this than expected - pref[i].score[1] = pref[i].score[1] / expectedPercent; + best[i].score = best[i].score / expectedPercent; + + } + + //sort Preferences highest to lowest + sort(best.begin(), best.end(), comparePref); + + return best; + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "getBestPref"); + exit(1); + } +} +/**************************************************************************************************/ +int Bellerophon::writePrefs(string file, linePair tempLine) { + try { + + ofstream outTemp; + openOutputFile(file, outTemp); + + //lets you know what part of the pref matrix you are writing + outTemp << tempLine.start << '\t' << tempLine.num << endl; + + for (int i = tempLine.start; i < (tempLine.start + tempLine.num); i++) { + + for (int j = 0; j < numSeqs; j++) { + + if (m->control_pressed) { outTemp.close(); remove(file.c_str()); return 0; } + + outTemp << pref[i][j].name << '\t' << pref[i][j].leftParent << '\t' << pref[i][j].rightParent << '\t'; + outTemp << pref[i][j].score << '\t' << pref[i][j].closestLeft << '\t' << pref[i][j].closestRight << '\t' << 
pref[i][j].midpoint << endl; + } + } + + outTemp.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "writePrefs"); + exit(1); + } +} +/**************************************************************************************************/ +int Bellerophon::readPrefs(string file) { + try { + + ifstream inTemp; + openInputFile(file, inTemp); + + int start, num; + + //header written by writePrefs: first window index and number of windows stored in this file + inTemp >> start >> num; gobble(inTemp); + + for (int i = start; i < (start + num); i++) { //num is a count, not an end index - writePrefs stores rows start..start+num-1 - //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]); + for (int j = 0; j < numSeqs; j++) { + + if (m->control_pressed) { inTemp.close(); remove(file.c_str()); return 0; } - //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. -//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; + inTemp >> pref[i][j].name >> pref[i][j].leftParent >> pref[i][j].rightParent; + inTemp >> pref[i][j].score >> pref[i][j].closestLeft >> pref[i][j].closestRight >> pref[i][j].midpoint; + gobble(inTemp); + } } - //is this score bigger then the last score - for (int i = 0; i < pref.size(); i++) { + inTemp.close(); + + remove(file.c_str()); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "readPrefs"); + exit(1); + } +} +/**************************************************************************************************/ +vector Bellerophon::getBestWindow(linePair line) { + try { + + vector best; + + //for each sequence + for (int i = 0; i < numSeqs; i++) { + + //set best pref score to first one + Preference temp = pref[line.start][i]; - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return best; } - //update biggest score - if (pref[i].score[1] > pref[i].score[0]) { - pref[i].score[0] = pref[i].score[1]; - pref[i].leftParent[0] = pref[i].leftParent[1]; - pref[i].rightParent[0] = pref[i].rightParent[1]; - pref[i].closestLeft[0] = pref[i].closestLeft[1]; - 
pref[i].closestRight[0] = pref[i].closestRight[1]; - pref[i].midpoint = mid; + //for each window + for (int j = (line.start+1); j < (line.start+line.num); j++) { + + //is this a better score + if (pref[j][i].score > temp.score) { temp = pref[j][i]; } } + string tempString = temp.name + '\t' + temp.leftParent + '\t' + temp.rightParent + '\t' + toString(temp.score); + best.push_back(tempString); } + + return best; + + } + catch(exception& e) { + m->errorOut(e, "Bellerophon", "getBestWindow"); + exit(1); + } +} +/**************************************************************************************************/ +int Bellerophon::fillPref(int process, vector& best) { + try { + //figure out where you start so you can put the best scores there + int numSeqsPerProcessor = iters / processors; + int start = process * numSeqsPerProcessor; - return 1; + for (int i = 0; i < best.size(); i++) { + + if (m->control_pressed) { return 0; } + + istringstream iss (best[i],istringstream::in); + + string tempScore; + iss >> pref[start][i].name >> pref[start][i].leftParent >> pref[start][i].rightParent >> tempScore; + convert(tempScore, pref[start][i].score); + } + return 0; } catch(exception& e) { - m->errorOut(e, "Bellerophon", "generatePreferences"); + m->errorOut(e, "Bellerophon", "fillPref"); exit(1); } } + /**************************************************************************************************/ diff --git a/bellerophon.h b/bellerophon.h index 3d05617..1333ec8 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -12,7 +12,6 @@ #include "chimera.h" -#include "filterseqscommand.h" #include "sparsematrix.hpp" #include "sequence.hpp" #include "dist.h" @@ -25,22 +24,41 @@ typedef map SeqMap; //maps sequence to all distance for that seqeun class Bellerophon : public Chimera { public: - Bellerophon(string, string); - ~Bellerophon() {}; + Bellerophon(string, bool, bool, int, int, int, string); //fastafile, filter, correction, window, increment, processors, outputDir); + 
~Bellerophon() { delete distCalculator; for (int i = 0; i < seqs.size(); i++) { delete seqs[i]; } seqs.clear(); } int getChimeras(); int print(ostream&, ostream&); + #ifdef USE_MPI + int print(MPI_File&, MPI_File&); + #endif + private: + struct linePair { + int start; + int num; + linePair(long int i, int j) : start(i), num(j) {} + }; + + vector lines; + Dist* distCalculator; - FilterSeqsCommand* filterSeqs; vector seqs; - vector pref; + vector< vector > pref; //pref[0] = preference scores for all seqs in window 0. string fastafile; - int iters; + int iters, count, window, increment, numSeqs, processors; //iters = number of windows + bool correction; int generatePreferences(vector, vector, int); int createSparseMatrix(int, int, SparseMatrix*, vector); + vector getBestPref(); + int driverChimeras(vector, linePair); + int createProcesses(vector); + int writePrefs(string, linePair); + int readPrefs(string); + vector getBestWindow(linePair line); + int fillPref(int, vector&); }; /***********************************************************/ diff --git a/blastdb.cpp b/blastdb.cpp index 17db069..780afe0 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -25,6 +25,19 @@ gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) { queryFileName = toString(randNumber) + ".candidate.unaligned.fasta"; blastFileName = toString(randNumber) + ".blast"; +} +/**************************************************************************************************/ + +BlastDB::BlastDB() : Database() { + + globaldata = GlobalData::getInstance(); + count = 0; + + int randNumber = rand(); + dbFileName = toString(randNumber) + ".template.unaligned.fasta"; + queryFileName = toString(randNumber) + ".candidate.unaligned.fasta"; + blastFileName = toString(randNumber) + ".blast"; + } /**************************************************************************************************/ @@ -181,6 +194,56 @@ void BlastDB::generateDB() { exit(1); } } +#ifdef USE_MPI 
+/**************************************************************************************************/ +int BlastDB::MPISend(int receiver) { + try { + + //send gapOpen - float + MPI_Send(&gapOpen, 1, MPI_FLOAT, receiver, 2001, MPI_COMM_WORLD); + + //send gapExtend - float + MPI_Send(&gapExtend, 1, MPI_FLOAT, receiver, 2001, MPI_COMM_WORLD); + + //send match - float + MPI_Send(&match, 1, MPI_FLOAT, receiver, 2001, MPI_COMM_WORLD); + + //send mismatch - float + MPI_Send(&misMatch, 1, MPI_FLOAT, receiver, 2001, MPI_COMM_WORLD); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "BlastDB", "MPISend"); + exit(1); + } +} +/**************************************************************************************************/ +int BlastDB::MPIRecv(int sender) { + try { + MPI_Status status; + + //receive gapOpen - float + MPI_Recv(&gapOpen, 1, MPI_FLOAT, sender, 2001, MPI_COMM_WORLD, &status); + + //receive gapExtend - float + MPI_Recv(&gapExtend, 1, MPI_FLOAT, sender, 2001, MPI_COMM_WORLD, &status); + + //receive match - float + MPI_Recv(&match, 1, MPI_FLOAT, sender, 2001, MPI_COMM_WORLD, &status); + + //receive mismatch - float + MPI_Recv(&misMatch, 1, MPI_FLOAT, sender, 2001, MPI_COMM_WORLD, &status); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "BlastDB", "MPIRecv"); + exit(1); + } +} +#endif +/**************************************************************************************************/ /**************************************************************************************************/ diff --git a/blastdb.hpp b/blastdb.hpp index 0f8fccd..d61aaec 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -18,13 +18,19 @@ class BlastDB : public Database { public: BlastDB(float, float, float, float); + BlastDB(); ~BlastDB(); void generateDB(); void addSequence(Sequence); vector findClosestSequences(Sequence*, int); vector findClosestMegaBlast(Sequence*, int); - + + #ifdef USE_MPI + int MPISend(int); //just sends gapOpen, gapExtend, match and mismatch + int 
MPIRecv(int); + #endif + private: string dbFileName; string queryFileName; diff --git a/ccode.cpp b/ccode.cpp index 3aad3f6..56856a9 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -13,28 +13,61 @@ //*************************************************************************************************************** -Ccode::Ccode(string filename, string o) { - fastafile = filename; outputDir = o; +Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() { + fastafile = filename; + outputDir = o; + templateFileName = temp; templateSeqs = readSeqs(temp); + setMask(mask); + filter = f; + window = win; + numWanted = numW; + distCalc = new eachGapDist(); decalc = new DeCalculator(); mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo"; - ofstream out2; - openOutputFile(mapInfo, out2); + + #ifdef USE_MPI - out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; - out2.close(); + char inFileName[mapInfo.length()]; + strcpy(inFileName, mapInfo.c_str()); + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap); //comm, filename, mode, info, filepointer + + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n"; + + MPI_Status status; + int length = outString.length(); + char buf2[length]; + strcpy(buf2, outString.c_str()); + + MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status); + } + #else + + ofstream out2; + openOutputFile(mapInfo, out2); + + out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + out2.close(); + #endif } //*************************************************************************************************************** Ccode::~Ccode() { delete distCalc; delete decalc; + + #ifdef USE_MPI + 
MPI_File_close(&outMap); + #endif } //*************************************************************************************************************** -void Ccode::printHeader(ostream& out) { - out << "For full window mapping info refer to " << mapInfo << endl << endl; -} -//*************************************************************************************************************** int Ccode::print(ostream& out, ostream& outAcc) { try { @@ -116,13 +149,136 @@ int Ccode::print(ostream& out, ostream& outAcc) { //free memory for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } - return 0; + return results; } catch(exception& e) { m->errorOut(e, "Ccode", "print"); exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +int Ccode::print(MPI_File& out, MPI_File& outAcc) { + try { + + string outMapString = ""; + + outMapString += querySeq->getName() + "\n"; + for (it = spotMap.begin(); it!= spotMap.end(); it++) { + outMapString += toString(it->first) + "\t" + toString(it->second) + "\n"; + } + printMapping(outMapString); + outMapString = ""; + + string outString = ""; + string outAccString = ""; + + outString += querySeq->getName() + "\n\nReference sequences used and distance to query:\n"; + + for (int j = 0; j < closest.size(); j++) { + outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n"; + } + outString += "\n\nMapping information: "; + + //for each window + //window mapping info. 
+ //you mask and did not filter + if ((seqMask != "") && (!filter)) { outString += "mask and trim."; } + + //you filtered and did not mask + if ((seqMask == "") && (filter)) { outString += "filter and trim."; } + + //you masked and filtered + if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; } + + outString += "\nWindow\tStartPos\tEndPos\n"; + it = trim.begin(); + for (int k = 0; k < windows.size()-1; k++) { + outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n"; + } + + outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n"; + + outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n"; + for (int k = 0; k < windows.size(); k++) { + float ds = averageQuery[k] / averageRef[k]; + outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n"; + } + + //varRef + //varQuery + /* F test for differences among variances. 
+ * varQuery is expected to be higher or similar than varRef */ + //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */ + + bool results = false; + + //confidence limit, t - Student, anova + outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n"; + + for (int k = 0; k < windows.size(); k++) { + string temp = ""; + if (isChimericConfidence[k]) { temp += "*\t"; } + else { temp += "\t"; } + + if (isChimericTStudent[k]) { temp += "*\t"; } + else { temp += "\t"; } + + if (isChimericANOVA[k]) { temp += "*\t"; } + else { temp += "\t"; } + + outString += toString(k+1) + "\t" + temp + "\n"; + + if (temp == "*\t*\t*\t") { results = true; } + } + outString += "\n"; + + MPI_Status status; + int length = outString.length(); + char buf2[length]; + strcpy(buf2, outString.c_str()); + + MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); + + if (results) { + m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine(); + outAccString += querySeq->getName() + "\n"; + + MPI_Status statusAcc; + length = outAccString.length(); + char buf[length]; + strcpy(buf, outAccString.c_str()); + + MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); + } + + //free memory + for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } + + return results; + } + catch(exception& e) { + m->errorOut(e, "Ccode", "print"); + exit(1); + } +} +//*************************************************************************************************************** +int Ccode::printMapping(string& output) { + try { + MPI_Status status; + int length = output.length(); + char buf[length]; + strcpy(buf, output.c_str()); + + MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status); + + } + catch(exception& e) { + m->errorOut(e, "Ccode", "printMapping"); + exit(1); + } +} +#endif //*************************************************************************************************************** 
int Ccode::getChimeras(Sequence* query) { try { diff --git a/ccode.h b/ccode.h index afc77cc..91ef845 100644 --- a/ccode.h +++ b/ccode.h @@ -24,17 +24,20 @@ class Ccode : public Chimera { public: - Ccode(string, string); + Ccode(string, string, bool, string, int, int, string); //fasta, template, filter, mask, window, numWanted, outputDir ~Ccode(); int getChimeras(Sequence* query); int print(ostream&, ostream&); - void printHeader(ostream&); + + #ifdef USE_MPI + int print(MPI_File&, MPI_File&); + #endif private: Dist* distCalc; DeCalculator* decalc; - int iters; + int iters, window, numWanted; string fastafile, mapInfo; Sequence* querySeq; @@ -75,6 +78,12 @@ class Ccode : public Chimera { int getDiff(string, string); //return number of mismatched bases, a gap to base is not counted as a mismatch float getT(int); float getF(int); + + #ifdef USE_MPI + int printMapping(string&); + MPI_File outMap; + #endif + }; /***********************************************************/ diff --git a/chimera.cpp b/chimera.cpp index 7eeca96..692a4fe 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -42,7 +42,6 @@ string Chimera::createFilter(vector seqs, float t) { } } - //zero out spot where all sequences have blanks //zero out spot where all sequences have blanks int numColRemoved = 0; for(int i = 0;i < seqs[0]->getAligned().length(); i++){ @@ -55,7 +54,8 @@ string Chimera::createFilter(vector seqs, float t) { //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; } - m->mothurOut("Filter removed " + toString(numColRemoved) + " columns."); m->mothurOutEndLine(); + if (threshold != 0.0) { m->mothurOut("Filter removed " + toString(numColRemoved) + " columns."); m->mothurOutEndLine(); } + return filterString; } catch(exception& e) { @@ -93,14 +93,68 @@ map Chimera::runFilter(Sequence* seq) { vector Chimera::readSeqs(string file) { try { - m->mothurOut("Reading sequences... 
"); cout.flush(); - ifstream in; - openInputFile(file, in); - vector container; int count = 0; length = 0; unaligned = false; + + m->mothurOut("Reading sequences from " + file + "..."); cout.flush(); + + #ifdef USE_MPI + int pid; + vector positions; + int numSeqs; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[file.length()]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(numSeqs+1); + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } + + //read file + for(int i=0;icontrol_pressed) { MPI_File_close(&inMPI); return container; } + + //read next sequence + int seqlength = positions[i+1] - positions[i]; + char buf4[seqlength]; + MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); } + + istringstream iss (tempBuf,istringstream::in); + + Sequence* current = new Sequence(iss); + if (current->getName() != "") { + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + container.push_back(current); + } + } + + MPI_File_close(&inMPI); + #else + + ifstream in; + openInputFile(file, in); //read in seqs and store in vector while(!in.eof()){ @@ -110,14 +164,13 @@ vector Chimera::readSeqs(string file) { Sequence* current = 
new Sequence(in); gobble(in); if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length - else if (length != current->getAligned().length()) { //seqs are unaligned - unaligned = true; - } - + else if (length != current->getAligned().length()) { unaligned = true; } + if (current->getName() != "") { container.push_back(current); } } - in.close(); + #endif + m->mothurOut("Done."); m->mothurOutEndLine(); return container; @@ -137,64 +190,53 @@ void Chimera::setMask(string filename) { }else if (filename == "") { //do nothing seqMask = ""; }else{ + + #ifdef USE_MPI + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[filename.length()]; + strcpy(inFileName, filename.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + if (!iss.eof()) { + Sequence temp(iss); + seqMask = temp.getAligned(); + }else { + m->mothurOut("Problem with mask."); m->mothurOutEndLine(); + seqMask = ""; + } + + MPI_File_close(&inMPI); + #else + ifstream infile; openInputFile(filename, infile); - while (!infile.eof()) { + if (!infile.eof()) { Sequence temp(infile); seqMask = temp.getAligned(); - - gobble(infile); + }else { + m->mothurOut("Problem with mask."); m->mothurOutEndLine(); + seqMask = ""; } - infile.close(); - } - } - catch(exception& e) { - m->errorOut(e, "Chimera", "setMask"); - exit(1); - } -} -//*************************************************************************************************************** - -vector< vector > Chimera::readQuantiles() { - try { - - ifstream in; - openInputFile(quanfile, in); - - vector< vector > quan; - vector temp; temp.resize(6, 0); - - //to fill 0 
- quan.push_back(temp); - - int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; - - while(!in.eof()){ - - in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; - - temp.clear(); - - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); - - quan.push_back(temp); + #endif - gobble(in); } - - in.close(); - return quan; - } catch(exception& e) { - m->errorOut(e, "Chimera", "readQuantiles"); + m->errorOut(e, "Chimera", "setMask"); exit(1); } } diff --git a/chimera.h b/chimera.h index ef62b53..11bc435 100644 --- a/chimera.h +++ b/chimera.h @@ -17,13 +17,14 @@ /***********************************************************************/ struct Preference { string name; - vector leftParent; //keep the name of closest left associated with the two scores - vector rightParent; //keep the name of closest right associated with the two scores - vector score; //so you can keep last score and calc this score and keep whichever is bigger. 
- vector closestLeft; //keep the closest left associated with the two scores - vector closestRight; //keep the closest right associated with the two scores + string leftParent; //keep the name of closest left + string rightParent; //keep the name of closest + float score; //preference score + float closestLeft; //keep the closest left + float closestRight; //keep the closest right int midpoint; - + Preference() { name = ""; leftParent = ""; rightParent = ""; score = 0.0; closestLeft = 10000.0; closestRight = 10000.0; midpoint = 0; } + ~Preference() {} }; /***********************************************************************/ struct score_struct { @@ -88,38 +89,38 @@ class Chimera { public: - Chimera(){ m = MothurOut::getInstance(); } - Chimera(string) { m = MothurOut::getInstance(); } - Chimera(string, bool, string) { m = MothurOut::getInstance(); } - Chimera(string, string) { m = MothurOut::getInstance(); } + Chimera(){ m = MothurOut::getInstance(); length = 0; unaligned = false; } + //Chimera(string) { m = MothurOut::getInstance(); } + //Chimera(string, bool, string) { m = MothurOut::getInstance(); } + //Chimera(string, string) { m = MothurOut::getInstance(); } virtual ~Chimera(){ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } }; - virtual void setFilter(bool f) { filter = f; } - virtual void setCorrection(bool c) { correction = c; } - virtual void setProcessors(int p) { processors = p; } - virtual void setWindow(int w) { window = w; } - virtual void setIncrement(int i) { increment = i; } - virtual void setNumWanted(int n) { numWanted = n; } - virtual void setKmerSize(int k) { kmerSize = k; } - virtual void setSVG(int s) { svg = s; } - virtual void setName(string n) { name = n; } - virtual void setMatch(int m) { match = m; } - virtual void setMisMatch(int m) { misMatch = m; } - virtual void setDivR(float d) { divR = d; } - virtual void setParents(int p) { parents = p; } - virtual void setMinSim(int s) { minSim = s; } - virtual void 
setMinCoverage(int c) { minCov = c; } - virtual void setMinBS(int b) { minBS = b; } - virtual void setMinSNP(int s) { minSNP = s; } - virtual void setIters(int i) { iters = i; } + //virtual void setFilter(bool f) { filter = f; } + //virtual void setCorrection(bool c) { correction = c; } + //virtual void setProcessors(int p) { processors = p; } + //virtual void setWindow(int w) { window = w; } + //virtual void setIncrement(int i) { increment = i; } + //virtual void setNumWanted(int n) { numWanted = n; } + //virtual void setKmerSize(int k) { kmerSize = k; } + //virtual void setSVG(int s) { svg = s; } + //virtual void setName(string n) { name = n; } + //virtual void setMatch(int m) { match = m; } + //virtual void setMisMatch(int m) { misMatch = m; } + //virtual void setDivR(float d) { divR = d; } + //virtual void setParents(int p) { parents = p; } + //virtual void setMinSim(int s) { minSim = s; } + //virtual void setMinCoverage(int c) { minCov = c; } + //virtual void setMinBS(int b) { minBS = b; } + //virtual void setMinSNP(int s) { minSNP = s; } + //virtual void setIters(int i) { iters = i; } virtual bool getUnaligned() { return unaligned; } - virtual void setTemplateFile(string t) { templateFileName = t; templateSeqs = readSeqs(t); } + //virtual void setTemplateFile(string t) { templateFileName = t; templateSeqs = readSeqs(t); } virtual int getLength() { return length; } - virtual void setCons(string){}; - virtual void setQuantiles(string){}; - virtual int doPrep(){ return 0; } + //virtual void setCons(string){}; + //virtual void setQuantiles(string){}; + //virtual int doPrep(){ return 0; } virtual vector readSeqs(string); - virtual vector< vector > readQuantiles(); + //virtual vector< vector > readQuantiles(); virtual void setMask(string); virtual map runFilter(Sequence*); virtual string createFilter(vector, float); @@ -127,16 +128,20 @@ class Chimera { virtual void printHeader(ostream&){}; virtual int getChimeras(Sequence*){ return 0; } virtual int getChimeras(){ 
return 0; } - virtual int print(ostream&, ostream&){ return 0; } + virtual int print(ostream&, ostream&){ return 0; } + + #ifdef USE_MPI + virtual int print(MPI_File&, MPI_File&){ return 0; } + #endif protected: vector templateSeqs; - bool filter, correction, svg, unaligned; - int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, length; - float divR; - string seqMask, quanfile, filterString, name, outputDir, templateFileName; + bool filter, unaligned; // correction, svg, + int length; //processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, + //float divR; + string seqMask, filterString, outputDir, templateFileName; //quanfile, name, Sequence* getSequence(string); //find sequence from name MothurOut* m; }; diff --git a/chimerabellerophoncommand.cpp b/chimerabellerophoncommand.cpp new file mode 100644 index 0000000..f36d7a0 --- /dev/null +++ b/chimerabellerophoncommand.cpp @@ -0,0 +1,190 @@ +/* + * chimerabellerophoncommand.cpp + * Mothur + * + * Created by westcott on 4/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "chimerabellerophoncommand.h" +#include "bellerophon.h" + +//*************************************************************************************************************** + +ChimeraBellerophonCommand::ChimeraBellerophonCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","filter","correction","processors","window","increment","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.bellerophon command."); m->mothurOutEndLine(); abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + } + + string temp; + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } + filter = isTrue(temp); + + temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } + correction = isTrue(temp); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } + convert(temp, increment); + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraBellerophonCommand", "ChimeraBellerophonCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ChimeraBellerophonCommand::help(){ + try { + m->mothurOut("The chimera.bellerophon command reads a fastafile and creates list of potentially chimeric sequences.\n"); + m->mothurOut("The chimera.bellerophon command parameters are fasta, filter, correction, processors, window, increment. 
The fasta parameter is required.\n"); + m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter, default=false. \n"); + m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n"); + m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif + m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default is 1/4 sequence length. \n"); + m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default is 25.\n"); + m->mothurOut("chimera.bellerophon(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors) \n"); + m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, window=200) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. 
fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ChimeraBellerophonCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ChimeraBellerophonCommand::~ChimeraBellerophonCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ChimeraBellerophonCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + int start = time(NULL); + + chimera = new Bellerophon(fastafile, filter, correction, window, increment, processors, outputDir); + + string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "bellerophon.chimeras"; + string accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "bellerophon.accnos"; + bool hasAccnos = true; + + chimera->getChimeras(); + + if (m->control_pressed) { delete chimera; return 0; } + + #ifdef USE_MPI + MPI_File outMPI; + MPI_File outMPIAccnos; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char outFilename[accnosFileName.length()]; + strcpy(outFilename, accnosFileName.c_str()); + + char FileName[outputFileName.length()]; + strcpy(FileName, outputFileName.c_str()); + + MPI_File_open(MPI_COMM_WORLD, FileName, outMode, MPI_INFO_NULL, &outMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); + + numSeqs = chimera->print(outMPI, outMPIAccnos); + + MPI_File_close(&outMPI); + MPI_File_close(&outMPIAccnos); + + #else + + ofstream out; + openOutputFile(outputFileName, out); + + ofstream out2; + openOutputFile(accnosFileName, out2); + + numSeqs = chimera->print(out, out2); + out.close(); + out2.close(); + + #endif + + if (m->control_pressed) { remove(accnosFileName.c_str()); remove(outputFileName.c_str()); delete chimera; return 0; } + + //delete accnos file if its blank + if 
(isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + delete chimera; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraBellerophonCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/chimerabellerophoncommand.h b/chimerabellerophoncommand.h new file mode 100644 index 0000000..e450b52 --- /dev/null +++ b/chimerabellerophoncommand.h @@ -0,0 +1,39 @@ +#ifndef CHIMERABELLEROPHONCOMMAND_H +#define CHIMERABELLEROPHONCOMMAND_H + +/* + * chimerabellerophoncommand.h + * Mothur + * + * Created by westcott on 4/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "chimera.h" + + +/***********************************************************/ + +class ChimeraBellerophonCommand : public Command { +public: + ChimeraBellerophonCommand(string); + ~ChimeraBellerophonCommand(); + int execute(); + void help(); + +private: + + bool abort, filter, correction; + string fastafile, outputDir; + int processors, window, increment, numSeqs; + Chimera* chimera; +}; + +/***********************************************************/ + +#endif + + diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp new file mode 100644 index 0000000..748163e --- /dev/null +++ b/chimeraccodecommand.cpp @@ -0,0 +1,574 @@ +/* + * chimeraccodecommand.cpp + * Mothur + * + * Created by westcott on 3/30/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "chimeraccodecommand.h" +#include "ccode.h" + +//*************************************************************************************************************** + +ChimeraCcodeCommand::ChimeraCcodeCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta", "filter", "processors", "window", "template", "mask", "numwanted", "outputdir","inputdir", }; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("template"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["template"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.ccode command."); m->mothurOutEndLine(); abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + } + + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.ccode command."); m->mothurOutEndLine(); abort = true; } + + maskfile = validParameter.validFile(parameters, "mask", false); + if (maskfile == "not found") { maskfile = ""; } + else if (maskfile != "default") { + if (inputDir != "") { + string path = hasPath(maskfile); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { maskfile = inputDir + maskfile; } + } + + ifstream in; + int ableToOpen = openInputFile(maskfile, in); + if (ableToOpen == 1) { abort = true; } + in.close(); + } + + string temp; + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } + filter = isTrue(temp); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; } + convert(temp, numwanted); + + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraCcodeCommand", "ChimeraCcodeCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ChimeraCcodeCommand::help(){ + try { + + m->mothurOut("The chimera.ccode command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); + m->mothurOut("This command was created using the algorythms described in the 'Evaluating putative chimeric sequences from PCR-amplified products' paper by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.\n"); + m->mothurOut("The chimera.ccode command parameters are fasta, template, filter, mask, processors, window and numwanted.\n"); + m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); + m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n"); + m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. 
\n"); + m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif + m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n"); + m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras. \n"); + m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with.\n"); + m->mothurOut("The chimera.ccode command should be in the following format: \n"); + m->mothurOut("chimera.ccode(fasta=yourFastaFile, template=yourTemplate) \n"); + m->mothurOut("Example: chimera.seqs(fasta=AD.align, template=core_set_aligned.imputed.fasta) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ChimeraCcodeCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ChimeraCcodeCommand::~ChimeraCcodeCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ChimeraCcodeCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + int start = time(NULL); + + //set user options + if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); } + + chimera = new Ccode(fastafile, templatefile, filter, maskfile, window, numwanted, outputDir); + + //is your template aligned? 
+ if (chimera->getUnaligned()) { m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); delete chimera; return 0; } + templateSeqsLength = chimera->getLength(); + + string outputFileName, accnosFileName; + if (maskfile != "") { + outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".ccode.chimeras"; + accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".ccode.accnos"; + }else { + outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "ccode.chimeras"; + accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "ccode.accnos"; + } + + string mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo"; + bool hasAccnos = true; + + if (m->control_pressed) { delete chimera; return 0; } + + #ifdef USE_MPI + + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + MPIWroteAccnos = false; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPI; + MPI_File outMPIAccnos; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outFilename[outputFileName.length()]; + strcpy(outFilename, outputFileName.c_str()); + + char outAccnosFilename[accnosFileName.length()]; + strcpy(outAccnosFilename, accnosFileName.c_str()); + + char inFileName[fastafile.length()]; + strcpy(inFileName, fastafile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + if (pid == 0) { //you are the root process + string outTemp 
= "For full window mapping info refer to " + mapInfo + "\n\n"; + + //print header + int length = outTemp.length(); + char buf2[length]; + strcpy(buf2, outTemp.c_str()); + MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &status); + + MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; } + + for (int i = 1; i < processors; i++) { + bool tempResult; + MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + if (tempResult != 0) { MPIWroteAccnos = true; } + } + }else{ //you are a child process + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(numSeqs+1); + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, 
MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPI); + MPI_File_close(&outMPIAccnos); + + //delete accnos file if blank + if (pid == 0) { + if (!MPIWroteAccnos) { + //MPI_Info info; + //MPI_File_delete(outAccnosFilename, info); + hasAccnos = false; + remove(accnosFileName.c_str()); + } + } + + #else + ofstream outHeader; + string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + "ccode.chimeras.tempHeader"; + openOutputFile(tempHeader, outHeader); + + outHeader << "For full window mapping info refer to " << mapInfo << endl << endl; + + outHeader.close(); + + //break up file + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(tempHeader.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + + }else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numSeqs = positions.size(); + + int numSeqsPerProcessor = numSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; + } + 
lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + + createProcesses(outputFileName, fastafile, accnosFileName); + + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); + + //append output files + for(int i=1;i nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;icontrol_pressed) { + remove(outputFileName.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + } + + #else + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(tempHeader.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + #endif + + //m->mothurOut("Output File Names: "); + //if ((filter) && (method == "bellerophon")) { m->mothurOut( + //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; } + // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; } + + appendFiles(outputFileName, tempHeader); + + remove(outputFileName.c_str()); + rename(tempHeader.c_str(), outputFileName.c_str()); + #endif + + delete chimera; + + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); 
+ + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraCcodeCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int ChimeraCcodeCommand::driver(linePair* line, string outputFName, string filename, string accnos){ + try { + ofstream out; + openOutputFile(outputFName, out); + + ofstream out2; + openOutputFile(accnos, out2); + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + inFASTA.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + if (m->control_pressed) { return 1; } + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. 
Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + chimera->print(out, out2); + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + + out.close(); + out2.close(); + inFASTA.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraCcodeCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +#ifdef USE_MPI +int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ + try { + + MPI_Status status; + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + for(int i=0;icontrol_pressed) { return 0; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); gobble(iss); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. 
Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + bool isChimeric = chimera->print(outMPI, outAccMPI); + if (isChimeric) { MPIWroteAccnos = true; } + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + } + //report progress + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraCcodeCommand", "driverMPI"); + exit(1); + } +} +#endif + +/**************************************************************************************************/ + +int ChimeraCcodeCommand::createProcesses(string outputFileName, string filename, string accnos) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "ChimeraCcodeCommand", "createProcesses"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h new file mode 100644 index 0000000..9a0efb9 --- 
/dev/null +++ b/chimeraccodecommand.h @@ -0,0 +1,56 @@ +#ifndef CHIMERACCODECOMMAND_H +#define CHIMERACCODECOMMAND_H + +/* + * chimeraccodecommand.h + * Mothur + * + * Created by westcott on 3/30/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "chimera.h" + + +/***********************************************************/ + +class ChimeraCcodeCommand : public Command { +public: + ChimeraCcodeCommand(string); + ~ChimeraCcodeCommand(); + int execute(); + void help(); + + +private: + + struct linePair { + int start; + int numSeqs; + linePair(long int i, int j) : start(i), numSeqs(j) {} + }; + vector processIDS; //processid + vector lines; + + int driver(linePair*, string, string, string); + int createProcesses(string, string, string); + + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + #endif + + bool abort, filter, MPIWroteAccnos; + string fastafile, templatefile, outputDir, maskfile; + int processors, window, numwanted, numSeqs, templateSeqsLength; + Chimera* chimera; + + +}; + +/***********************************************************/ + +#endif + diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp new file mode 100644 index 0000000..ba5e5be --- /dev/null +++ b/chimeracheckcommand.cpp @@ -0,0 +1,464 @@ +/* + * chimeracheckcommand.cpp + * Mothur + * + * Created by westcott on 3/31/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "chimeracheckcommand.h" +#include "chimeracheckrdp.h" + +//*************************************************************************************************************** + +ChimeraCheckCommand::ChimeraCheckCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","processors","increment","template","ksize","svg", "name","outputdir","inputdir" }; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("template"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["template"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["name"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + } + + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } + convert(temp, ksize); + + temp = validParameter.validFile(parameters, "svg", false); if (temp == "not found") { temp = "F"; } + svg = isTrue(temp); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; } + convert(temp, increment); + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckCommand", "ChimeraCheckCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void 
ChimeraCheckCommand::help(){ + try { + + m->mothurOut("The chimera.check command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); + m->mothurOut("This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n"); + m->mothurOut("The chimera.check command parameters are fasta, template, processors, ksize, increment, svg and name.\n"); + m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); + m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n"); + m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif + m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default is 10.\n"); + m->mothurOut("The ksize parameter allows you to input kmersize, default is 7. \n"); + m->mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence, default is False.\n"); + m->mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"); + m->mothurOut("The chimera.check command should be in the following format: \n"); + m->mothurOut("chimera.check(fasta=yourFastaFile, template=yourTemplateFile, processors=yourProcessors, ksize=yourKmerSize) \n"); + m->mothurOut("Example: chimera.check(fasta=AD.fasta, template=core_set_aligned,imputed.fasta, processors=4, ksize=8) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. 
fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ChimeraCheckCommand::~ChimeraCheckCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ChimeraCheckCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + int start = time(NULL); + + chimera = new ChimeraCheckRDP(fastafile, templatefile, namefile, svg, increment, ksize, outputDir); + + if (m->control_pressed) { delete chimera; return 0; } + + string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "chimeracheck.chimeras"; + + #ifdef USE_MPI + + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPI; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outFilename[outputFileName.length()]; + strcpy(outFilename, outputFileName.c_str()); + + char inFileName[fastafile.length()]; + strcpy(inFileName, fastafile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); delete chimera; return 0; } + + if (pid == 0) { //you are the root process + MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many 
sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); remove(outputFileName.c_str()); delete chimera; return 0; } + + //wait on chidren + for(int i = 1; i < processors; i++) { + char buf[4]; + MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); + } + }else{ //you are a child process + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(numSeqs+1); + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); delete chimera; return 0; } + + //tell parent you are done. 
+ char buf[4]; + strcpy(buf, "done"); + MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPI); + #else + + //break up file + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + }else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numSeqs = positions.size(); + + int numSeqsPerProcessor = numSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + + createProcesses(outputFileName, fastafile); + + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); + + //append output files + for(int i=1;icontrol_pressed) { + remove(outputFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + } + + #else + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], 
outputFileName, fastafile); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + #endif + #endif + delete chimera; + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + + m->mothurOutEndLine(); m->mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); m->mothurOutEndLine(); + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filename){ + try { + ofstream out; + openOutputFile(outputFName, out); + + ofstream out2; + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + inFASTA.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + if (m->control_pressed) { return 1; } + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + chimera->print(out, out2); + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + 
toString(line->numSeqs)); m->mothurOutEndLine(); } + + out.close(); + inFASTA.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +#ifdef USE_MPI +int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos){ + try { + MPI_File outAccMPI; + MPI_Status status; + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + for(int i=0;icontrol_pressed) { return 0; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); gobble(iss); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + //find chimeras + chimera->getChimeras(candidateSeq); + + //print results + chimera->print(outMPI, outAccMPI); + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + } + //report progress + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckCommand", "driverMPI"); + exit(1); + } +} +#endif + +/**************************************************************************************************/ + +int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // 
processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "ChimeraCheckCommand", "createProcesses"); + exit(1); + } +} +/**************************************************************************************************/ + + diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h new file mode 100644 index 0000000..6db61bd --- /dev/null +++ b/chimeracheckcommand.h @@ -0,0 +1,57 @@ +#ifndef CHIMERACHECKCOMMAND_H +#define CHIMERACHECKCOMMAND_H + +/* + * chimeracheckcommand.h + * Mothur + * + * Created by westcott on 3/31/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "mothur.h" +#include "command.hpp" +#include "chimera.h" + + +/***********************************************************/ + +class ChimeraCheckCommand : public Command { +public: + ChimeraCheckCommand(string); + ~ChimeraCheckCommand(); + int execute(); + void help(); + + +private: + + struct linePair { + int start; + int numSeqs; + linePair(long int i, int j) : start(i), numSeqs(j) {} + }; + vector processIDS; //processid + vector lines; + + int driver(linePair*, string, string); + int createProcesses(string, string); + + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, vector&); + #endif + + bool abort, svg; + string fastafile, templatefile, namefile, outputDir; + int processors, increment, ksize, numSeqs, templateSeqsLength; + Chimera* chimera; + + +}; + +/***********************************************************/ + +#endif + + diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index 790d3eb..51a3d9b 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -10,7 +10,30 @@ #include "chimeracheckrdp.h" //*************************************************************************************************************** -ChimeraCheckRDP::ChimeraCheckRDP(string filename, string o) { fastafile = filename; outputDir = o; } +ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string n, bool s, int inc, int k, string o) : Chimera() { + try { + fastafile = filename; + templateFileName = temp; + name = n; + svg = s; + increment = inc; + kmerSize = k; + outputDir = o; + + templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0); + m->mothurOutEndLine(); + + kmer = new Kmer(kmerSize); + + if (name != "") { + readName(name); //fills name map with names of seqs the user wants to have .svg for. 
+ } + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "ChimeraCheckRDP"); + exit(1); + } +} //*************************************************************************************************************** ChimeraCheckRDP::~ChimeraCheckRDP() { @@ -19,7 +42,7 @@ ChimeraCheckRDP::~ChimeraCheckRDP() { delete kmer; } catch(exception& e) { - m->errorOut(e, "ChimeraCheckRDP", "~AlignSim"); + m->errorOut(e, "ChimeraCheckRDP", "~ChimeraCheckRDP"); exit(1); } } @@ -56,25 +79,49 @@ int ChimeraCheckRDP::print(ostream& out, ostream& outAcc) { exit(1); } } +#ifdef USE_MPI //*************************************************************************************************************** -int ChimeraCheckRDP::doPrep() { +int ChimeraCheckRDP::print(MPI_File& out, MPI_File& outAcc) { try { - templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0); - m->mothurOutEndLine(); - kmer = new Kmer(kmerSize); + cout << "Processing: " << querySeq->getName() << endl; - if (name != "") { - readName(name); //fills name map with names of seqs the user wants to have .svg for. 
+ string outString = ""; + + outString += querySeq->getName() + "\nIS scores: \t"; + + for (int k = 0; k < IS.size(); k++) { + outString += toString(IS[k].score) + "\t"; + } + outString += "\n"; + + MPI_Status status; + int length = outString.length(); + char buf[length]; + strcpy(buf, outString.c_str()); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + + if (svg) { + if (name != "") { //if user has specific names + map::iterator it = names.find(querySeq->getName()); + + if (it != names.end()) { //user wants pic of this + makeSVGpic(IS); //zeros out negative results + } + }else{//output them all + makeSVGpic(IS); //zeros out negative results + } } return 0; } catch(exception& e) { - m->errorOut(e, "ChimeraCheckRDP", "doPrep"); + m->errorOut(e, "ChimeraCheckRDP", "print"); exit(1); } } +#endif //*************************************************************************************************************** int ChimeraCheckRDP::getChimeras(Sequence* query) { try { @@ -123,6 +170,8 @@ vector ChimeraCheckRDP::findIS() { //for each window for (int f = start; f < (seq.length() - start); f+=increment) { + + if (m->control_pressed) { return isValues; } if ((f - kmerSize) < 0) { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); } @@ -204,18 +253,47 @@ vector ChimeraCheckRDP::findIS() { //*************************************************************************************************************** void ChimeraCheckRDP::readName(string namefile) { try{ - ifstream in; - openInputFile(namefile, in); + string name; + + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + char inFileName[namefile.length()]; + strcpy(inFileName, namefile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() 
> size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + while(!iss.eof()) { + iss >> name; gobble(iss); + names[name] = name; + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + openInputFile(namefile, in); + while (!in.eof()) { - - in >> name; - + in >> name; gobble(in); names[name] = name; - - gobble(in); } + in.close(); + + #endif } catch(exception& e) { @@ -260,7 +338,80 @@ int ChimeraCheckRDP::calcKmers(map query, map subject) { exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +void ChimeraCheckRDP::makeSVGpic(vector info) { + try{ + + string file = outputDir + querySeq->getName() + ".chimeracheck.svg"; + + MPI_File outSVG; + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char FileName[file.length()]; + strcpy(FileName, file.c_str()); + + MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outSVG); //comm, filename, mode, info, filepointer + + int width = (info.size()*5) + 150; + + string outString = ""; + + outString += "\n"; + outString += "\n"; + outString += "Plotted IS values for " + querySeq->getName() + "\n"; + + outString += "\n"; + outString += "\n"; + + outString += "" + toString(info[0].midpoint) + "\n"; + outString += "" + toString(info[info.size()-1].midpoint) + "\n"; + outString += "Base Positions\n"; + + outString += "0\n"; + + outString += "IS\n"; + + + //find max is score + float biggest = 0.0; + for (int i = 0; i < info.size(); i++) { + if (info[i].score > biggest) { + biggest = info[i].score; + } + } + + outString += "" + toString(biggest) + "\n"; + + int scaler2 = 500 / biggest; + + + outString += " "; + for (int i = 0; i < info.size(); i++) { + if(info[i].score < 0) { info[i].score = 0; } + outString += toString(((i*5) + 75)) + "," + toString((600 - (info[i].score * scaler2))) + " "; + } + + outString += "\"/> "; + outString += "\n\n"; + + MPI_Status status; + int length = 
outString.length(); + char buf2[length]; + strcpy(buf2, outString.c_str()); + + MPI_File_write(outSVG, buf2, length, MPI_CHAR, &status); + + MPI_File_close(&outSVG); + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic"); + exit(1); + } +} +#else //*************************************************************************************************************** void ChimeraCheckRDP::makeSVGpic(vector info) { try{ @@ -318,6 +469,7 @@ void ChimeraCheckRDP::makeSVGpic(vector info) { exit(1); } } -//*************************************************************************************************************** +#endif +//***************************************************************************************************************/ diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index f54faab..f17c7b1 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -25,15 +25,17 @@ class ChimeraCheckRDP : public Chimera { public: - ChimeraCheckRDP(string, string); + ChimeraCheckRDP(string, string, string, bool, int, int, string); //fasta, template, name, svg, increment, ksize, outputDir ~ChimeraCheckRDP(); int getChimeras(Sequence*); int print(ostream&, ostream&); - int doPrep(); - private: + #ifdef USE_MPI + int print(MPI_File&, MPI_File&); + #endif + private: Sequence* querySeq; AlignmentDB* templateDB; @@ -43,6 +45,9 @@ class ChimeraCheckRDP : public Chimera { vector IS; //IS is the vector of IS values for each window for query string fastafile; map names; + string name; + bool svg; + int kmerSize, increment; vector findIS(); int calcKmers(map, map); diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp new file mode 100644 index 0000000..919285d --- /dev/null +++ b/chimerapintailcommand.cpp @@ -0,0 +1,573 @@ +/* + * chimerapintailcommand.cpp + * Mothur + * + * Created by westcott on 4/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "chimerapintailcommand.h" +#include "pintail.h" + +//*************************************************************************************************************** + +ChimeraPintailCommand::ChimeraPintailCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","filter","processors","window","increment","template","conservation","quantile","mask","outputdir","inputdir"}; /* every name needs its own comma: adjacent string literals are concatenated into one entry */ + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("template"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["template"] = inputDir + it->second; } + } + + it = parameters.find("conservation"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. 
else leave path alone. + if (path == "") { parameters["conservation"] = inputDir + it->second; } + } + + it = parameters.find("quantile"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["quantile"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + } + + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; } + + consfile = validParameter.validFile(parameters, "conservation", true); + if (consfile == "not open") { abort = true; } + else if (consfile == "not found") { consfile = ""; } + + quanfile = validParameter.validFile(parameters, "quantile", true); + if (quanfile == "not open") { abort = true; } + else if (quanfile == "not found") { quanfile = ""; } + + maskfile = validParameter.validFile(parameters, "mask", false); + if (maskfile == "not found") { maskfile = ""; } + else if (maskfile != "default") { + if (inputDir != "") { + string path = hasPath(maskfile); + //if the user has not given a path then, add inputdir. 
else leave path alone. + if (path == "") { maskfile = inputDir + maskfile; } + } + + ifstream in; + int ableToOpen = openInputFile(maskfile, in); + if (ableToOpen == 1) { abort = true; } + in.close(); + } + + string temp; + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } + filter = isTrue(temp); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } + convert(temp, increment); + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ChimeraPintailCommand::help(){ + try { + + m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); + m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n"); + m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n"); + m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); + m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. 
\n"); + m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n"); + m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n"); + m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif + m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n"); + m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n"); + m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n"); + m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n"); + m->mothurOut("The chimera.pintail command should be in the following format: \n"); + m->mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); + m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. 
fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ChimeraPintailCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ChimeraPintailCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + int start = time(NULL); + + chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir); + + //set user options + if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); } + + + string outputFileName, accnosFileName; + if (maskfile != "") { + outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras"; + accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.accnos"; + }else { + outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.chimeras"; + accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.accnos"; + } + bool hasAccnos = true; + + if (m->control_pressed) { delete chimera; return 0; } + + if (chimera->getUnaligned()) { + m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); + delete chimera; + return 0; + } + templateSeqsLength = chimera->getLength(); + + #ifdef USE_MPI + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + MPIWroteAccnos = false; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPI; + MPI_File outMPIAccnos; + + int 
outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outFilename[outputFileName.length() + 1]; /* +1 leaves room for the terminating NUL that strcpy writes */ + strcpy(outFilename, outputFileName.c_str()); + + char outAccnosFilename[accnosFileName.length() + 1]; + strcpy(outAccnosFilename, accnosFileName.c_str()); + + char inFileName[fastafile.length() + 1]; + strcpy(inFileName, fastafile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + if (pid == 0) { //you are the root process + + MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; } + + for (int i = 1; i < processors; i++) { + int tempResult = 0; /* must be an int: MPI_Recv stores one MPI_INT (sizeof(int) bytes) in this buffer */ + MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + if (tempResult != 0) { MPIWroteAccnos = true; } + } + }else{ //you are a child process + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(numSeqs+1); + MPI_Bcast(&MPIPos[0], 
(numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + int startIndex = pid * numSeqsPerProcessor; /* computed from the even share, before the last process enlarges its count */ + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + int wroteAccnos = MPIWroteAccnos ? 1 : 0; MPI_Send(&wroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); /* send a real int: the bool member is smaller than one MPI_INT */ + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPI); + MPI_File_close(&outMPIAccnos); + + //delete accnos file if blank + if (pid == 0) { + if (!MPIWroteAccnos) { + //MPI_Info info; + //MPI_File_delete(outAccnosFilename, info); + hasAccnos = false; + remove(accnosFileName.c_str()); + } + } + + #else + + //break up file + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + + }else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numSeqs = 
positions.size(); + + int numSeqsPerProcessor = numSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + + createProcesses(outputFileName, fastafile, accnosFileName); + + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); + + //append output files + for(int i=1;i nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;icontrol_pressed) { + remove(outputFileName.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + } + + #else + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + #endif + + #endif + + delete chimera; + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + return 0; + + } + 
catch(exception& e) { + m->errorOut(e, "ChimeraPintailCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){ + try { + ofstream out; + openOutputFile(outputFName, out); + + ofstream out2; + openOutputFile(accnos, out2); + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + inFASTA.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + if (m->control_pressed) { return 1; } + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + chimera->print(out, out2); + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + + out.close(); + out2.close(); + inFASTA.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPintailCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +#ifdef USE_MPI +int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ + try { + + MPI_Status status; + int pid; + 
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + for(int i=0;icontrol_pressed) { return 1; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); gobble(iss); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + bool isChimeric = chimera->print(outMPI, outAccMPI); + if (isChimeric) { MPIWroteAccnos = true; } + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + } + //report progress + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraPintailCommand", "driverMPI"); + exit(1); + } +} +#endif + +/**************************************************************************************************/ + +int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) 
{ + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "ChimeraPintailCommand", "createProcesses"); + exit(1); + } +} + +/**************************************************************************************************/ + + diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h new file mode 100644 index 0000000..0ddfc1c --- /dev/null +++ b/chimerapintailcommand.h @@ -0,0 +1,58 @@ +#ifndef CHIMERAPINTAILCOMMAND_H +#define CHIMERAPINTAILCOMMAND_H + +/* + * chimerapintailcommand.h + * Mothur + * + * Created by westcott on 4/1/10. + * Copyright 2010 Schloss Lab. All rights reserved. 
+ * + */ + +#include "mothur.h" +#include "command.hpp" +#include "chimera.h" + + +/***********************************************************/ + +class ChimeraPintailCommand : public Command { + +public: + + ChimeraPintailCommand(string); + ~ChimeraPintailCommand(); + int execute(); + void help(); + +private: + + struct linePair { + int start; + int numSeqs; + linePair(long int i, int j) : start(i), numSeqs(j) {} + }; + vector processIDS; //processid + vector lines; + + int driver(linePair*, string, string, string); + int createProcesses(string, string, string); + + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + #endif + + bool abort, filter, MPIWroteAccnos; + string fastafile, templatefile, consfile, quanfile, maskfile, outputDir; + int processors, window, increment, numSeqs, templateSeqsLength; + Chimera* chimera; + + +}; + +/***********************************************************/ + +#endif + + diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 65e082b..663b894 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -8,262 +8,13 @@ */ #include "chimeraseqscommand.h" -#include "bellerophon.h" -#include "pintail.h" -#include "ccode.h" -#include "chimeracheckrdp.h" -#include "chimeraslayer.h" - //*************************************************************************************************************** -ChimeraSeqsCommand::ChimeraSeqsCommand(string option) { - try { - abort = false; - - //allow user to run help - if(option == "help") { help(); abort = true; } - - else { - //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", - "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" }; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - - 
OptionParser parser(option); - map parameters = parser.getParameters(); - - ValidParameters validParameter; - map::iterator it; - - //check to make sure all parameters are valid for command - for (it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } - } - - //if the user changes the input directory command factory will send this info to us in the output parameter - string inputDir = validParameter.validFile(parameters, "inputdir", false); - if (inputDir == "not found"){ inputDir = ""; } - else { - string path; - it = parameters.find("fasta"); - //user has given a template file - if(it != parameters.end()){ - path = hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["fasta"] = inputDir + it->second; } - } - - it = parameters.find("template"); - //user has given a template file - if(it != parameters.end()){ - path = hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["template"] = inputDir + it->second; } - } - - it = parameters.find("conservation"); - //user has given a template file - if(it != parameters.end()){ - path = hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["conservation"] = inputDir + it->second; } - } - - it = parameters.find("quantile"); - //user has given a template file - if(it != parameters.end()){ - path = hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["quantile"] = inputDir + it->second; } - } - - it = parameters.find("name"); - //user has given a template file - if(it != parameters.end()){ - path = hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. 
- if (path == "") { parameters["name"] = inputDir + it->second; } - } - } - - - //check for required parameters - fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } - else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.seqs command."); m->mothurOutEndLine(); abort = true; } - - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ - outputDir = ""; - outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it - } - - templatefile = validParameter.validFile(parameters, "template", true); - if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; } - - consfile = validParameter.validFile(parameters, "conservation", true); - if (consfile == "not open") { abort = true; } - else if (consfile == "not found") { consfile = ""; } - - quanfile = validParameter.validFile(parameters, "quantile", true); - if (quanfile == "not open") { abort = true; } - else if (quanfile == "not found") { quanfile = ""; } - - namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } - else if (namefile == "not found") { namefile = ""; } - - maskfile = validParameter.validFile(parameters, "mask", false); - if (maskfile == "not found") { maskfile = ""; } - else if (maskfile != "default") { - if (inputDir != "") { - string path = hasPath(maskfile); - //if the user has not given a path then, add inputdir. else leave path alone. 
- if (path == "") { maskfile = inputDir + maskfile; } - } - - ifstream in; - int ableToOpen = openInputFile(maskfile, in); - if (ableToOpen == 1) { abort = true; } - in.close(); - } - - method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } - - string temp; - temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } - filter = isTrue(temp); - - temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } - correction = isTrue(temp); - - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } - convert(temp, processors); - - temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } - convert(temp, ksize); - - temp = validParameter.validFile(parameters, "svg", false); if (temp == "not found") { temp = "F"; } - svg = isTrue(temp); - - temp = validParameter.validFile(parameters, "window", false); - if ((temp == "not found") && (method == "chimeraslayer")) { temp = "50"; } - else if (temp == "not found") { temp = "0"; } - convert(temp, window); - - temp = validParameter.validFile(parameters, "match", false); if (temp == "not found") { temp = "5"; } - convert(temp, match); - - temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; } - convert(temp, mismatch); - - temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; } - convert(temp, divR); - - temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; } - convert(temp, minSimilarity); - - temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; } - convert(temp, minCoverage); - - temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; } - 
convert(temp, minBS); - - temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; } - convert(temp, minSNP); - - temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; } - convert(temp, parents); - - temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; } - realign = isTrue(temp); - - search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; } - - temp = validParameter.validFile(parameters, "iters", false); - if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; } - else if (temp == "not found") { temp = "1000"; } - convert(temp, iters); - - temp = validParameter.validFile(parameters, "increment", false); - if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; } - else if ((temp == "not found") && (method == "chimeraslayer")) { temp = "5"; } - else if (temp == "not found") { temp = "25"; } - convert(temp, increment); - - temp = validParameter.validFile(parameters, "numwanted", false); - if ((temp == "not found") && (method == "chimeraslayer")) { temp = "15"; } - else if (temp == "not found") { temp = "20"; } - convert(temp, numwanted); - - if ((search != "distance") && (search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } - - if (((method != "bellerophon")) && (templatefile == "")) { m->mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); m->mothurOutEndLine(); abort = true; } - - - } - } - catch(exception& e) { - m->errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand"); - exit(1); - } -} +ChimeraSeqsCommand::ChimeraSeqsCommand(string option) {} //********************************************************************************************************************** -void ChimeraSeqsCommand::help(){ 
- try { - - //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name" - //m->mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n"); - m->mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n"); - m->mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters, search, realign.\n"); - m->mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n"); - m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n"); - m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n"); - m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); - m->mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include bellerophon, ccode and chimeracheck \n"); - m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n"); - m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras. \n"); - m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences.\n"); - m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences. 
\n"); - m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n"); - m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences.\n"); - m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with.\n"); - m->mothurOut("The ksize parameter allows you to input kmersize. \n"); - m->mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n"); - m->mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"); - m->mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); - m->mothurOut("The minsim parameter allows you .... \n"); - m->mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"); - m->mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"); - m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); - m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. -used only by chimeraslayer. \n"); - m->mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false. -used only by chimeraslayer. \n"); - m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. 
Please look below for method specifics.\n\n"); - m->mothurOut("Details for each method: \n"); - m->mothurOut("\tpintail: \n"); - m->mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=300, increment=25, conservation=not required, but will improve speed, quantile=not required, but will greatly improve speed. \n"); - m->mothurOut("\t\tIf you have run chimera.seqs using pintail a .quan and .freq file will be created for your template, if you have not provided them for use in future command executions.\n"); - m->mothurOut("\tbellerophon: \n"); - m->mothurOut("\t\tparameters: fasta=required, filter=F, processors=1, window=1/4 length of seq, increment=25, correction=T. \n"); - m->mothurOut("\tccode: \n"); - m->mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=10% of length, numwanted=20\n"); - m->mothurOut("\tchimeracheck: \n"); - m->mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, ksize=7, svg=F, name=none\n\n"); - m->mothurOut("\tchimeraslayer: \n"); - m->mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, mask=no mask, numwanted=10, match=5, mismatch=-4, divergence=1.0, minsim=90, parents=5, iters=1000, window=100. \n\n"); - m->mothurOut("The chimera.seqs command should be in the following format: \n"); - m->mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); - m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. 
fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); - } - catch(exception& e) { - m->errorOut(e, "ChimeraSeqsCommand", "help"); - exit(1); - } -} +void ChimeraSeqsCommand::help(){} //*************************************************************************************************************** @@ -272,365 +23,11 @@ ChimeraSeqsCommand::~ChimeraSeqsCommand(){ /* do nothing */ } //*************************************************************************************************************** int ChimeraSeqsCommand::execute(){ - try{ - - if (abort == true) { return 0; } - - int start = time(NULL); - - if (method == "bellerophon") { chimera = new Bellerophon(fastafile, outputDir); } - else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); } - else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); } - else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); } - else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign, fastafile); } - else { m->mothurOut("Not a valid method."); m->mothurOutEndLine(); return 0; } - - //set user options - if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. 
FBD013."); m->mothurOutEndLine(); } - - chimera->setCons(consfile); - chimera->setQuantiles(quanfile); - chimera->setMask(maskfile); - chimera->setFilter(filter); - chimera->setCorrection(correction); - chimera->setProcessors(processors); - chimera->setWindow(window); - chimera->setIncrement(increment); - chimera->setNumWanted(numwanted); - chimera->setKmerSize(ksize); - chimera->setSVG(svg); - chimera->setName(namefile); - chimera->setMatch(match); - chimera->setMisMatch(mismatch); - chimera->setDivR(divR); - chimera->setParents(parents); - chimera->setMinSim(minSimilarity); - chimera->setMinCoverage(minCoverage); - chimera->setMinBS(minBS); - chimera->setMinSNP(minSNP); - chimera->setIters(iters); - - - string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; - string accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".accnos"; - bool hasAccnos = true; - - if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it - chimera->getChimeras(); - - if (m->control_pressed) { delete chimera; return 0; } - - ofstream out; - openOutputFile(outputFileName, out); - - ofstream out2; - openOutputFile(accnosFileName, out2); - - chimera->print(out, out2); - out.close(); - out2.close(); - - if (m->control_pressed) { remove(accnosFileName.c_str()); remove(outputFileName.c_str()); delete chimera; return 0; } - - //delete accnos file if its blank - if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); - if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - delete chimera; - return 0; - } - - //reads template - chimera->setTemplateFile(templatefile); - - if (m->control_pressed) { delete chimera; return 0; } - - if (method != 
"chimeracheck") { - if (chimera->getUnaligned()) { - m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); - delete chimera; - return 0; - } - } - - //some methods need to do prep work before processing the chimeras - chimera->doPrep(); - - if (m->control_pressed) { delete chimera; return 0; } - - templateSeqsLength = chimera->getLength(); - - ofstream outHeader; - string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader"; - openOutputFile(tempHeader, outHeader); - - chimera->printHeader(outHeader); - outHeader.close(); - - - //break up file - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastafile, accnosFileName); - - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } - - //delete accnos file if its blank - if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } - - }else{ - vector positions; - processIDS.resize(0); - - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numSeqs = positions.size(); - - int numSeqsPerProcessor = numSeqs / processors; - - for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; - } - 
lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - - createProcesses(outputFileName, fastafile, accnosFileName); - - rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); - - //append output files - for(int i=1;i nonBlankAccnosFiles; - //delete blank accnos files generated with multiple processes - for(int i=0;icontrol_pressed) { - remove(outputFileName.c_str()); - remove(accnosFileName.c_str()); - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } - - } - - #else - ifstream inFASTA; - openInputFile(candidateFileNames[s], inFASTA); - numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); - inFASTA.close(); - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastafile, accnosFileName); - - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } - - //delete accnos file if its blank - if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } - #endif - - //m->mothurOut("Output File Names: "); - //if ((filter) && (method == "bellerophon")) { m->mothurOut( - //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; } - // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; } - - appendOutputFiles(tempHeader, outputFileName); - remove(outputFileName.c_str()); - rename(tempHeader.c_str(), outputFileName.c_str()); + m->mothurOut("The chimera.seqs command has been broken up into 5 separate commands.\n"); + m->mothurOut("The chimera.bellerophon, chimera.ccode, chimera.check, chimera.pintail and chimera.slayer commands.\n"); - delete chimera; - - if (method == "chimeracheck") { remove(accnosFileName.c_str()); m->mothurOutEndLine(); m->mothurOut("This method 
does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); m->mothurOutEndLine(); } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); - if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - - m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "ChimeraSeqsCommand", "execute"); - exit(1); - } -}//********************************************************************************************************************** - -int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename, string accnos){ - try { - ofstream out; - openOutputFile(outputFName, out); - - ofstream out2; - openOutputFile(accnos, out2); - - ifstream inFASTA; - openInputFile(filename, inFASTA); - - inFASTA.seekg(line->start); - - for(int i=0;inumSeqs;i++){ - - if (m->control_pressed) { return 1; } - - Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); - - if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file - - if ((candidateSeq->getAligned().length() != templateSeqsLength) && (method != "chimeracheck")) { //chimeracheck does not require seqs to be aligned - m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. 
Skipping."); m->mothurOutEndLine(); - }else{ - //find chimeras - chimera->getChimeras(candidateSeq); - - if (m->control_pressed) { delete candidateSeq; return 1; } - - //print results - chimera->print(out, out2); - } - } - delete candidateSeq; - - //report progress - if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } - } - //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } - - out.close(); - out2.close(); - inFASTA.close(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "ChimeraSeqsCommand", "driver"); - exit(1); - } -} - -/**************************************************************************************************/ - -int ChimeraSeqsCommand::createProcesses(string outputFileName, string filename, string accnos) { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - // processIDS.resize(0); - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later - process++; - }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); - exit(0); - }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;ierrorOut(e, "ChimeraSeqsCommand", "createProcesses"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) { - try{ - - ofstream output; - ifstream input; - - openOutputFileAppend(temp, output); - openInputFile(filename, input, "noerror"); - - while(char c = 
input.get()){ - if(input.eof()) { break; } - else { output << c; } - } - - input.close(); - output.close(); - } - catch(exception& e) { - m->errorOut(e, "ChimeraSeqsCommand", "appendOuputFiles"); - exit(1); - } + return 0; } //********************************************************************************************************************** diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 040d2dd..afbc259 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -12,8 +12,6 @@ #include "mothur.h" #include "command.hpp" -#include "chimera.h" - /***********************************************************/ @@ -27,26 +25,6 @@ public: private: - struct linePair { - int start; - int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} - }; - vector processIDS; //processid - vector lines; - - int driver(linePair*, string, string, string); - int createProcesses(string, string, string); - void appendOutputFiles(string, string); - - bool abort; - string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir, search; - bool filter, correction, svg, printAll, realign; - int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; - float divR; - Chimera* chimera; - - }; /***********************************************************/ diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 57ed713..fa706ea 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -12,21 +12,94 @@ #include "kmerdb.hpp" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) { - decalc = new DeCalculator(); +ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, +int minsim, int mincov, 
int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { + try { + fastafile = file; + templateFileName = temp; templateSeqs = readSeqs(temp); + searchMethod = mode; + kmerSize = k; + match = ms; + misMatch = mms; + window = win; + divR = div; + minSim = minsim; + minCov = mincov; + minBS = minbs; + minSNP = minsnp; + parents = par; + iters = it; + increment = inc; + numWanted = numw; + realign = r; + + decalc = new DeCalculator(); + + doPrep(); + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); + exit(1); + } } //*************************************************************************************************************** int ChimeraSlayer::doPrep() { try { - + + + //read in all query seqs + vector tempQuerySeqs = readSeqs(fastafile); + + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + + createFilter(temp, 0.0); //just removed columns where all seqs have a gap + + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + + if (m->control_pressed) { return 0; } + + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } + string kmerDBNameLeft; string kmerDBNameRight; - + //generate the kmerdb to pass to maligner if (searchMethod == "kmer") { - //leftside + string rightTemplateFileName = "right." + templateFileName; + databaseRight = new KmerDB(rightTemplateFileName, kmerSize); + string leftTemplateFileName = "left." 
+ templateFileName; - databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); + databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); + #ifdef USE_MPI + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + string leftFrag = templateSeqs[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(templateSeqs[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); + } + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(templateSeqs.size()); + + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + + string rightFrag = templateSeqs[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(templateSeqs[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + databaseRight->setNumSeqs(templateSeqs.size()); + + #else + //leftside kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTestLeft(kmerDBNameLeft.c_str()); @@ -52,8 +125,6 @@ int ChimeraSlayer::doPrep() { databaseLeft->setNumSeqs(templateSeqs.size()); //rightside - string rightTemplateFileName = "right." 
+ templateFileName; - databaseRight = new KmerDB(rightTemplateFileName, kmerSize); kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTestRight(kmerDBNameRight.c_str()); @@ -76,40 +147,8 @@ int ChimeraSlayer::doPrep() { kmerFileTestRight.close(); databaseRight->setNumSeqs(templateSeqs.size()); - - } - - int start = time(NULL); - //filter the sequences - //read in all query seqs - ifstream in; - openInputFile(fastafile, in); - - vector tempQuerySeqs; - while(!in.eof()){ - if (m->control_pressed) { for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } return 0; } - - Sequence* s = new Sequence(in); - gobble(in); - - if (s->getName() != "") { tempQuerySeqs.push_back(s); } + #endif } - in.close(); - - vector temp = templateSeqs; - for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - - createFilter(temp, 0.0); //just removed columns where all seqs have a gap - - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - - if (m->control_pressed) { return 0; } - - - //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } - - m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to filter."); m->mothurOutEndLine(); return 0; @@ -158,13 +197,65 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) { exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { + try { + MPI_Status status; + bool results = false; + string outAccString = ""; + string outputString = ""; + + if (chimeraFlags == "yes") { + string chimeraFlag = "no"; + if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) + || + (chimeraResults[0].bsb >= minBS && 
chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + + + if (chimeraFlag == "yes") { + if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { + cout << querySeq->getName() << "\tyes" << endl; + outAccString += querySeq->getName() + "\n"; + results = true; + + //write to accnos file + int length = outAccString.length(); + char buf2[length]; + strcpy(buf2, outAccString.c_str()); + + MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status); + } + } + + outputString = getBlock(chimeraResults[0]); + outputString += "\n"; + + }else { outputString += querySeq->getName() + "\tno\n"; } + + //write to output file + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + + return results; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "print"); + exit(1); + } +} +#endif + //*************************************************************************************************************** int ChimeraSlayer::getChimeras(Sequence* query) { try { chimeraFlags = "no"; //filter query - spotMap = runFilter(query); + spotMap = runFilter(query); querySeq = query; @@ -274,7 +365,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //*************************************************************************************************************** void ChimeraSlayer::printBlock(data_struct data, ostream& out){ try { - //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; + //out << ":)\n"; out << querySeq->getName() << '\t'; out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t'; @@ -307,4 +398,25 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out){ } } //*************************************************************************************************************** +string ChimeraSlayer::getBlock(data_struct data){ + 
try { + + string outputString = ""; + + outputString += querySeq->getName() + "\t"; + outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t"; + + outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t"; + outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t"; + + outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t"; + + return outputString; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getBlock"); + exit(1); + } +} +//***************************************************************************************************************/ diff --git a/chimeraslayer.h b/chimeraslayer.h index 58e1656..3ce4cce 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -15,7 +15,7 @@ #include "maligner.h" #include "slayer.h" -/***********************************************************************/ +//***********************************************************************/ //This class was modeled after the chimeraSlayer written by the Broad Institute /***********************************************************************/ @@ -23,7 +23,7 @@ class ChimeraSlayer : public Chimera { public: - ChimeraSlayer(string, bool, string); + ChimeraSlayer(string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); ~ChimeraSlayer(); int getChimeras(Sequence*); @@ -31,6 +31,10 @@ class ChimeraSlayer : public Chimera { void printHeader(ostream&); int doPrep(); + #ifdef USE_MPI + int print(MPI_File&, MPI_File&); + #endif + private: Sequence* querySeq; DeCalculator* decalc; @@ -43,8 +47,11 @@ class ChimeraSlayer : public Chimera { vector chimeraResults; string chimeraFlags, searchMethod, fastafile; bool realign; + int window, numWanted, kmerSize, match, misMatch, minSim, minCov, 
minBS, minSNP, parents, iters, increment; + float divR; void printBlock(data_struct, ostream&); + string getBlock(data_struct); }; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp new file mode 100644 index 0000000..336dba9 --- /dev/null +++ b/chimeraslayercommand.cpp @@ -0,0 +1,604 @@ +/* + * chimeraslayercommand.cpp + * Mothur + * + * Created by westcott on 3/31/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "chimeraslayercommand.h" +#include "bellerophon.h" +#include "pintail.h" +#include "ccode.h" +#include "chimeracheckrdp.h" +#include "chimeraslayer.h" + + +//*************************************************************************************************************** + +ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch", + "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" }; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user 
has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("template"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["template"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it + } + + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } + + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } + convert(temp, ksize); + + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "50"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "match", false); if (temp == "not found") { temp = "5"; } + convert(temp, match); + + temp = validParameter.validFile(parameters, "mismatch", false); if 
(temp == "not found") { temp = "-4"; } + convert(temp, mismatch); + + temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; } + convert(temp, divR); + + temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; } + convert(temp, minSimilarity); + + temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; } + convert(temp, minCoverage); + + temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; } + convert(temp, minBS); + + temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; } + convert(temp, minSNP); + + temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; } + convert(temp, parents); + + temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; } + realign = isTrue(temp); + + search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; } + + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } + convert(temp, iters); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "5"; } + convert(temp, increment); + + temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "15"; } + convert(temp, numwanted); + + if ((search != "distance") && (search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "ChimeraSlayerCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ChimeraSlayerCommand::help(){ + try { + + 
m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); + m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n"); + m->mothurOut("The chimera.slayer command parameters are fasta, template, filter, mask, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign, + m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); + m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n"); + m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif + m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n"); + m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=50. \n"); + m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=5.\n"); + m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with, default=15.\n"); + m->mothurOut("The ksize parameter allows you to input kmersize, default is 7, used if search is kmer. \n"); + m->mothurOut("The match parameter allows you to reward matched bases in blast search, default is 5. \n"); + m->mothurOut("The parents parameter allows you to select the number of potential parents to investigate from the numwanted best matches after rating them, default is 3. 
\n"); + m->mothurOut("The mismatch parameter allows you to penalize mismatched bases in blast search, default is -4. \n"); + m->mothurOut("The divergence parameter allows you to set a cutoff for chimera determination, default is 1.007. \n"); + m->mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=100.\n"); + m->mothurOut("The minsim parameter allows you to specify a minimum similarity with the parent fragments, default=90. \n"); + m->mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"); + m->mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"); + m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); + m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n"); + //m->mothurOut("The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. Found to make results worse. \n"); + m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); + m->mothurOut("The chimera.slayer command should be in the following format: \n"); + m->mothurOut("chimera.slayer(fasta=yourFastaFile, template=yourTemplate, search=yourSearch) \n"); + m->mothurOut("Example: chimera.slayer(fasta=AD.align, template=core_set_aligned.imputed.fasta, search=kmer) \n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. 
fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "help"); + exit(1); + } +} + +//*************************************************************************************************************** + +ChimeraSlayerCommand::~ChimeraSlayerCommand(){ /* do nothing */ } + +//*************************************************************************************************************** + +int ChimeraSlayerCommand::execute(){ + try{ + + if (abort == true) { return 0; } + + int start = time(NULL); + + chimera = new ChimeraSlayer(fastafile, templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + + string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "slayer.chimeras"; + string accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "slayer.accnos"; + bool hasAccnos = true; + + if (m->control_pressed) { delete chimera; return 0; } + + if (chimera->getUnaligned()) { + m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); + delete chimera; + return 0; + } + templateSeqsLength = chimera->getLength(); + + #ifdef USE_MPI + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + MPIWroteAccnos = false; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPI; + MPI_File outMPIAccnos; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outFilename[outputFileName.length()]; + strcpy(outFilename, outputFileName.c_str()); + + char outAccnosFilename[accnosFileName.length()]; + strcpy(outAccnosFilename, accnosFileName.c_str()); + + char inFileName[fastafile.length()]; + strcpy(inFileName, fastafile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, 
&inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + + if (pid == 0) { //you are the root process + m->mothurOutEndLine(); + m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results."); + m->mothurOutEndLine(); + + string outTemp = "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; + + //print header + int length = outTemp.length(); + char buf2[length]; + strcpy(buf2, outTemp.c_str()); + MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &status); + + MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; } + + for (int i = 1; i < processors; i++) { + bool tempResult; + MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + if (tempResult != 0) { MPIWroteAccnos = true; } + } + }else{ //you are a child process + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get 
numSeqs + MPIPos.resize(numSeqs+1); + MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + + MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPI); + MPI_File_close(&outMPIAccnos); + + //delete accnos file if blank + if (pid == 0) { + if (!MPIWroteAccnos) { + //MPI_Info info; + //MPI_File_delete(outAccnosFilename, info); + hasAccnos = false; + remove(accnosFileName.c_str()); + } + } + + #else + ofstream outHeader; + string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + "slayer.chimeras.tempHeader"; + openOutputFile(tempHeader, outHeader); + + chimera->printHeader(outHeader); + outHeader.close(); + + //break up file + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(tempHeader.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + + }else{ + vector positions; + 
processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numSeqs = positions.size(); + + int numSeqsPerProcessor = numSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + + createProcesses(outputFileName, fastafile, accnosFileName); + + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); + + //append output files + for(int i=1;i nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;icontrol_pressed) { + remove(outputFileName.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + } + + #else + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile, accnosFileName); + + if (m->control_pressed) { + remove(outputFileName.c_str()); + remove(tempHeader.c_str()); + remove(accnosFileName.c_str()); + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + delete chimera; + return 0; + } + + //delete accnos file if its blank + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } + #endif + + appendFiles(tempHeader, outputFileName); + + remove(outputFileName.c_str()); + rename(tempHeader.c_str(), outputFileName.c_str()); + + #endif + delete chimera; + + 
m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int ChimeraSlayerCommand::driver(linePair* line, string outputFName, string filename, string accnos){ + try { + ofstream out; + openOutputFile(outputFName, out); + + ofstream out2; + openOutputFile(accnos, out2); + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + inFASTA.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + if (m->control_pressed) { return 1; } + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. 
Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + chimera->print(out, out2); + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + + out.close(); + out2.close(); + inFASTA.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +#ifdef USE_MPI +int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ + try { + + MPI_Status status; + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + for(int i=0;icontrol_pressed) { return 1; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); gobble(iss); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + if (candidateSeq->getAligned().length() != templateSeqsLength) { + m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. 
Skipping."); m->mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); + + if (m->control_pressed) { delete candidateSeq; return 1; } + + //print results + bool isChimeric = chimera->print(outMPI, outAccMPI); + if (isChimeric) { MPIWroteAccnos = true; } + } + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + } + //report progress + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "driverMPI"); + exit(1); + } +} +#endif + +/**************************************************************************************************/ + +int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename, string accnos) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "ChimeraSlayerCommand", "createProcesses"); + exit(1); + } +} + +/**************************************************************************************************/ + + diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h new file mode 100644 index 0000000..926326b --- /dev/null +++ 
b/chimeraslayercommand.h @@ -0,0 +1,58 @@ +#ifndef CHIMERASLAYERCOMMAND_H +#define CHIMERASLAYERCOMMAND_H + +/* + * chimeraslayercommand.h + * Mothur + * + * Created by westcott on 3/31/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "chimera.h" + + +/***********************************************************/ + +class ChimeraSlayerCommand : public Command { +public: + ChimeraSlayerCommand(string); + ~ChimeraSlayerCommand(); + int execute(); + void help(); + + +private: + + struct linePair { + int start; + int numSeqs; + linePair(long int i, int j) : start(i), numSeqs(j) {} + }; + vector processIDS; //processid + vector lines; + + int driver(linePair*, string, string, string); + int createProcesses(string, string, string); + + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + #endif + + bool abort, realign, MPIWroteAccnos; + string fastafile, templatefile, outputDir, search; + int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; + float divR; + Chimera* chimera; + + +}; + +/***********************************************************/ + +#endif + + diff --git a/classify.cpp b/classify.cpp index 2db1973..557f17c 100644 --- a/classify.cpp +++ b/classify.cpp @@ -22,6 +22,66 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f int start = time(NULL); int numSeqs = 0; + + m->mothurOut("Generating search database... 
"); cout.flush(); +#ifdef USE_MPI + int pid; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[tempFile.length()]; + strcpy(inFileName, tempFile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { //only one process needs to scan file + positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(numSeqs); + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } + + //create database + if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + names.push_back(temp.getName()); + database->addSequence(temp); + } + } + + database->generateDB(); + MPI_File_close(&inMPI); + #else + //need to know number of template seqs for suffixdb if (method == "suffix") { ifstream inFASTA; @@ -30,8 +90,6 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f inFASTA.close(); } - m->mothurOut("Generating search database... 
"); cout.flush(); - bool needToGenerate = true; string kmerDBName; if(method == "kmer") { @@ -81,7 +139,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f } fastaFile.close(); } - +#endif database->setNumSeqs(names.size()); m->mothurOut("DONE."); m->mothurOutEndLine(); @@ -99,14 +157,58 @@ void Classify::readTaxonomy(string file) { try { phyloTree = new PhyloTree(); + string name, taxInfo; - ifstream inTax; - openInputFile(file, inTax); - m->mothurOutEndLine(); m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); + +#ifdef USE_MPI + int pid, num; + vector positions; - string name, taxInfo; + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[file.length()]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = setFilePosEachLine(file, num); + + //send file positions to all processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(num); + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + + istringstream iss (tempBuf,istringstream::in); + iss >> name >> taxInfo; + taxonomy[name] = taxInfo; + phyloTree->addSeqToTree(name, taxInfo); + } + + MPI_File_close(&inMPI); +#else + ifstream inTax; + openInputFile(file, inTax); + //read template seqs and save while (!inTax.eof()) { inTax >> name >> taxInfo; @@ -117,10 +219,11 @@ void Classify::readTaxonomy(string file) { gobble(inTax); } - - phyloTree->assignHeirarchyIDs(0); inTax.close(); +#endif + phyloTree->assignHeirarchyIDs(0); + m->mothurOut("DONE."); 
m->mothurOutEndLine(); cout.flush(); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index ba854e9..a9f0a36 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -87,15 +87,37 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } int ableToOpen; + + #ifdef USE_MPI + int pid; + MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + ifstream in; ableToOpen = openInputFile(fastaFileNames[i], in); + in.close(); + + #ifdef USE_MPI + for (int j = 1; j < processors; j++) { + MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Status status; + MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + + #endif + if (ableToOpen == 1) { m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); //erase from file list fastaFileNames.erase(fastaFileNames.begin()+i); i--; } - in.close(); + } //make sure there is at least one valid file left @@ -125,12 +147,32 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { namefileNames[i] = inputDir + namefileNames[i]; } } - int ableToOpen; + + #ifdef USE_MPI + int pid; + MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + ifstream in; ableToOpen = openInputFile(namefileNames[i], in); - if (ableToOpen == 1) { m->mothurOut("Unable to match name file with fasta file."); m->mothurOutEndLine(); abort = true; } in.close(); + + #ifdef USE_MPI + for (int j = 1; j < processors; j++) { + MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Status status; + MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + + #endif + if (ableToOpen == 1) { m->mothurOut("Unable to match name file with fasta file."); m->mothurOutEndLine(); abort = true; } + } } @@ -211,6 +253,9 @@ void ClassifySeqsCommand::help(){ m->mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"); m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); + #ifdef USE_MPI + m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); + #endif m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. 
The default is -2.0.\n"); @@ -253,21 +298,6 @@ int ClassifySeqsCommand::execute(){ for (int s = 0; s < fastaFileNames.size(); s++) { - //read namefile - if(namefile != "") { - nameMap.clear(); //remove old names - - ifstream inNames; - openInputFile(namefileNames[s], inNames); - - string firstCol, secondCol; - while(!inNames.eof()) { - inNames >> firstCol >> secondCol; gobble(inNames); - nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. - } - inNames.close(); - } - m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); if (outputDir == "") { outputDir += hasPath(fastaFileNames[s]); } @@ -282,7 +312,102 @@ int ClassifySeqsCommand::execute(){ int numFastaSeqs = 0; for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#ifdef USE_MPI + int pid, end, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPINewTax; + MPI_File outMPITempTax; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outNewTax[newTaxonomyFile.length()]; + strcpy(outNewTax, newTaxonomyFile.c_str()); + + char outTempTax[tempTaxonomyFile.length()]; + strcpy(outTempTax, tempTaxonomyFile.c_str()); + + char inFileName[fastaFileNames[s].length()]; + strcpy(inFileName, fastaFileNames[s].c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); + MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + + if(namefile != "") { 
MPIReadNamesFile(namefileNames[s]); } + + if (pid == 0) { //you are the root process + + MPIPos = setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + + for (int i = 1; i < processors; i++) { + int done; + MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + } + }else{ //you are a child process + MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(numFastaSeqs+1); + MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + + int done = 0; + MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPINewTax); + 
MPI_File_close(&outMPITempTax); + +#else + //read namefile + if(namefile != "") { + nameMap.clear(); //remove old names + + ifstream inNames; + openInputFile(namefileNames[s], inNames); + + string firstCol, secondCol; + while(!inNames.eof()) { + inNames >> firstCol >> secondCol; gobble(inNames); + nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. + } + inNames.close(); + } + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ ifstream inFASTA; openInputFile(fastaFileNames[s], inFASTA); @@ -333,7 +458,7 @@ int ClassifySeqsCommand::execute(){ } } -#else + #else ifstream inFASTA; openInputFile(fastaFileNames[s], inFASTA); numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); @@ -342,7 +467,13 @@ int ClassifySeqsCommand::execute(){ lines.push_back(new linePair(0, numFastaSeqs)); driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); -#endif + #endif +#endif + + #ifdef USE_MPI + if (pid == 0) { //this part does not need to be paralellized + #endif + //make taxonomy tree from new taxonomy file PhyloTree taxaBrowser; @@ -416,6 +547,10 @@ int ClassifySeqsCommand::execute(){ remove(newTaxonomyFile.c_str()); rename(unclass.c_str(), newTaxonomyFile.c_str()); + #ifdef USE_MPI + } + #endif + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -577,5 +712,113 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa exit(1); } } +//********************************************************************************************************************** +#ifdef USE_MPI +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ + try { + MPI_Status statusNew; + MPI_Status statusTemp; + MPI_Status status; + + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, 
&pid); //find out who we are + + string taxonomy; + string outputString; + + for(int i=0;icontrol_pressed) { return 0; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence* candidateSeq = new Sequence(iss); + + if (candidateSeq->getName() != "") { + taxonomy = classify->getTaxonomy(candidateSeq); + + if (taxonomy != "bad seq") { + //output confidence scores or not + if (probs) { + outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; + }else{ + outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; + } + + int length = outputString.length(); + char buf2[length]; + strcpy(buf2, outputString.c_str()); + + MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); + + outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; + length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); + } + } + delete candidateSeq; + + if((i+1) % 100 == 0){ cout << "Classifying sequence " << (i+1) << endl; } + } + + if(num % 100 != 0){ cout << "Classifying sequence " << (num) << endl; } + + + return 1; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "driverMPI"); + exit(1); + } +} +//********************************************************************************************************************** +int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){ + try { + + nameMap.clear(); //remove old names + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[nameFilename.length()]; + strcpy(inFileName, nameFilename.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, 
MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + string firstCol, secondCol; + while(!iss.eof()) { + iss >> firstCol >> secondCol; gobble(iss); + nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. + } + + MPI_File_close(&inMPI); + + return 1; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "MPIReadNamesFile"); + exit(1); + } +} +#endif /**************************************************************************************************/ diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 890085e..0ffd18c 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -57,6 +57,11 @@ private: void appendTaxFiles(string, string); void createProcesses(string, string, string); string addUnclassifieds(string, int); + + int MPIReadNamesFile(string); + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + #endif }; #endif diff --git a/cluster.cpp b/cluster.cpp index bd5986e..e2d307e 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -50,11 +50,14 @@ rabund(rav), list(lv), dMatrix(dm), method(f) // a list contains pointers (iterators) to the all distances related // to a certain sequence. The Vector is accessed via the index of a // sequence in the distance matrix. 
+ + seqVec = vector(lv->size()); for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { seqVec[currentCell->row].push_back(currentCell); seqVec[currentCell->column].push_back(currentCell); } + mapWanted = false; //set to true by mgcluster to speed up overlap merge //save so you can modify as it changes in average neighbor @@ -86,50 +89,58 @@ void Cluster::getRowColCells() { /***********************************************************************/ // Remove the specified cell from the seqVec and from the sparse // matrix -void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix) -{ - ull drow = cell->row; - ull dcol = cell->column; - if (((vrow >=0) && (drow != smallRow)) || - ((vcol >=0) && (dcol != smallCol))) { - ull dtemp = drow; - drow = dcol; - dcol = dtemp; - } +void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){ + try { + + ull drow = cell->row; + ull dcol = cell->column; + if (((vrow >=0) && (drow != smallRow)) || + ((vcol >=0) && (dcol != smallCol))) { + ull dtemp = drow; + drow = dcol; + dcol = dtemp; + } - ull crow; - ull ccol; - int nCells; - if (vrow < 0) { - nCells = seqVec[drow].size(); - for (vrow=0; vrowrow; - ccol = seqVec[drow][vrow]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; + ull crow; + ull ccol; + int nCells; + if (vrow < 0) { + nCells = seqVec[drow].size(); + for (vrow=0; vrowrow; + ccol = seqVec[drow][vrow]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } } - } - } - seqVec[drow].erase(seqVec[drow].begin()+vrow); - if (vcol < 0) { - nCells = seqVec[dcol].size(); - for (vcol=0; vcolrow; - ccol = seqVec[dcol][vcol]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; + + seqVec[drow].erase(seqVec[drow].begin()+vrow); + if (vcol < 0) { + nCells = seqVec[dcol].size(); + for (vcol=0; 
vcolrow; + ccol = seqVec[dcol][vcol]->column; + if (((crow == drow) && (ccol == dcol)) || + ((ccol == drow) && (crow == dcol))) { + break; + } + } } - } + + seqVec[dcol].erase(seqVec[dcol].begin()+vcol); + + if (rmMatrix) { + dMatrix->rmCell(cell); + } + } - seqVec[dcol].erase(seqVec[dcol].begin()+vcol); - if (rmMatrix) { - dMatrix->rmCell(cell); + catch(exception& e) { + m->errorOut(e, "Cluster", "removeCell"); + exit(1); } } - - /***********************************************************************/ void Cluster::clusterBins(){ @@ -177,7 +188,7 @@ void Cluster::clusterNames(){ void Cluster::update(double& cutOFF){ try { getRowColCells(); - + vector foundCol(nColCells, 0); int search; diff --git a/clustercommand.cpp b/clustercommand.cpp index 7484a62..5e6a9b4 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -154,7 +154,7 @@ int ClusterCommand::execute(){ double saveCutoff = cutoff; while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ - + if (m->control_pressed) { //clean up delete globaldata->gSparseMatrix; globaldata->gSparseMatrix = NULL; delete globaldata->gListVector; globaldata->gListVector = NULL; @@ -176,6 +176,7 @@ int ClusterCommand::execute(){ loops++; cluster->update(cutoff); + float dist = matrix->getSmallDist(); float rndDist = roundDist(dist, precision); @@ -209,7 +210,7 @@ int ClusterCommand::execute(){ //delete globaldata's copy of the sparsematrix and listvector to free up memory delete globaldata->gSparseMatrix; globaldata->gSparseMatrix = NULL; delete globaldata->gListVector; globaldata->gListVector = NULL; - + //saves .list file so you can do the collect, rarefaction and summary commands without doing a read.list if (globaldata->getFormat() == "phylip") { globaldata->setPhylipFile(""); } else if (globaldata->getFormat() == "column") { globaldata->setColumnFile(""); } @@ -221,7 +222,7 @@ int ClusterCommand::execute(){ sabundFile.close(); rabundFile.close(); listFile.close(); - + if (saveCutoff != cutoff) { 
m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); } m->mothurOutEndLine(); diff --git a/commandfactory.cpp b/commandfactory.cpp index 727f6e8..aaeec67 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -65,6 +65,12 @@ #include "otuhierarchycommand.h" #include "setdircommand.h" #include "parselistscommand.h" +#include "parsesffcommand.h" +#include "chimeraccodecommand.h" +#include "chimeracheckcommand.h" +#include "chimeraslayercommand.h" +#include "chimerapintailcommand.h" +#include "chimerabellerophoncommand.h" /*******************************************************/ @@ -78,6 +84,12 @@ CommandFactory* CommandFactory::getInstance() { /***********************************************************/ /***********************************************************/ +//note: This class is resposible for knowing which commands are mpiEnabled, +//If a command is not enabled only process 0 will execute the command. +//This avoids redundant outputs on pieces of code we have not paralellized. +//If you add mpi code to a existing command you need to modify the list below or the code will hang on MPI blocking commands like FIle_open. 
+//example: commands["dist.seqs"] = "MPIEnabled"; + CommandFactory::CommandFactory(){ string s = ""; m = MothurOut::getInstance(); @@ -94,7 +106,6 @@ CommandFactory::CommandFactory(){ commands["get.oturep"] = "get.oturep"; commands["cluster"] = "cluster"; commands["unique.seqs"] = "unique.seqs"; - commands["dist.seqs"] = "MPIEnabled"; commands["dist.shared"] = "dist.shared"; commands["collect.single"] = "collect.single"; commands["collect.shared"] = "collect.shared"; @@ -117,13 +128,10 @@ CommandFactory::CommandFactory(){ commands["bootstrap.shared"] = "bootstrap.shared"; //commands["consensus"] = "consensus"; commands["help"] = "help"; - commands["filter.seqs"] = "MPIEnabled"; - commands["align.seqs"] = "align.seqs"; commands["summary.seqs"] = "summary.seqs"; commands["screen.seqs"] = "screen.seqs"; commands["reverse.seqs"] = "reverse.seqs"; commands["trim.seqs"] = "trim.seqs"; - commands["chimera.seqs"] = "chimera.seqs"; commands["list.seqs"] = "list.seqs"; commands["get.seqs"] = "get.seqs"; commands["remove.seqs"] = "get.seqs"; @@ -131,9 +139,7 @@ CommandFactory::CommandFactory(){ commands["align.check"] = "align.check"; commands["get.sharedseqs"] = "get.sharedseqs"; commands["get.otulist"] = "get.otulist"; - commands["quit"] = "MPIEnabled"; commands["hcluster"] = "hcluster"; - commands["classify.seqs"] = "classify.seqs"; commands["phylotype"] = "phylotype"; commands["mgcluster"] = "mgcluster"; commands["pre.cluster"] = "pre.cluster"; @@ -142,6 +148,19 @@ CommandFactory::CommandFactory(){ commands["set.dir"] = "set.dir"; commands["merge.files"] = "merge.files"; commands["parse.list"] = "parse.list"; + commands["parse.sff"] = "parse.sff"; + commands["classify.seqs"] = "MPIEnabled"; + commands["dist.seqs"] = "MPIEnabled"; + commands["filter.seqs"] = "MPIEnabled"; + commands["align.seqs"] = "MPIEnabled"; + commands["chimera.seqs"] = "chimera.seqs"; + commands["chimera.ccode"] = "MPIEnabled"; + commands["chimera.check"] = "MPIEnabled"; + commands["chimera.slayer"] = 
"MPIEnabled"; + commands["chimera.pintail"] = "MPIEnabled"; + commands["chimera.bellerophon"] = "MPIEnabled"; + commands["quit"] = "MPIEnabled"; + } /***********************************************************/ @@ -230,6 +249,11 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "get.otulist") { command = new GetListCountCommand(optionString); } else if(commandName == "hcluster") { command = new HClusterCommand(optionString); } else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); } + else if(commandName == "chimera.ccode") { command = new ChimeraCcodeCommand(optionString); } + else if(commandName == "chimera.check") { command = new ChimeraCheckCommand(optionString); } + else if(commandName == "chimera.slayer") { command = new ChimeraSlayerCommand(optionString); } + else if(commandName == "chimera.pintail") { command = new ChimeraPintailCommand(optionString); } + else if(commandName == "chimera.bellerophon") { command = new ChimeraBellerophonCommand(optionString); } else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); } else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); } else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); } @@ -237,6 +261,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "otu.hierarchy") { command = new OtuHierarchyCommand(optionString); } else if(commandName == "set.dir") { command = new SetDirectoryCommand(optionString); } else if(commandName == "parse.list") { command = new ParseListCommand(optionString); } + else if(commandName == "parse.sff") { command = new ParseSFFCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/database.hpp b/database.hpp index 994c845..efc7ba7 100644 --- a/database.hpp +++ b/database.hpp @@ -55,7 +55,11 @@ public: virtual void 
setNumSeqs(int i) { numSeqs = i; } virtual vector getSequencesWithKmer(int){ vector filler; return filler; }; virtual int getMaxKmer(){ return 1; }; - + + #ifdef USE_MPI + virtual int MPISend(int) = 0; + virtual int MPIRecv(int) = 0; + #endif protected: MothurOut* m; diff --git a/distancecommand.cpp b/distancecommand.cpp index 4720df3..47e22ad 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -162,6 +162,8 @@ int DistanceCommand::execute(){ if (abort == true) { return 0; } + int startTime = time(NULL); + int numSeqs = alignDB.getNumSeqs(); cutoff += 0.005; @@ -193,43 +195,104 @@ int DistanceCommand::execute(){ //each process gets where it should start and stop in the file start = int (sqrt(float(pid)/float(processors)) * numSeqs); end = int (sqrt(float(pid+1)/float(processors)) * numSeqs); - - MPI_File outMPI; - int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; - - char filename[outputFile.length()]; - strcpy(filename, outputFile.c_str()); - MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI); - - if (pid == 0) { //you are the root process - - //do your part - string outputMyPart; - driverMPI(start, end, outMPI, cutoff); + if (output != "lt") { + MPI_File outMPI; + int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char filename[outputFile.length()]; + strcpy(filename, outputFile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI); + + if (m->control_pressed) { MPI_File_close(&outMPI); delete distCalculator; return 0; } + + if (pid == 0) { //you are the root process + + //do your part + string outputMyPart; + driverMPI(start, end, outMPI, cutoff); + + if (m->control_pressed) { MPI_File_close(&outMPI); delete distCalculator; return 0; } + + //wait on chidren + for(int i = 1; i < processors; i++) { + if (m->control_pressed) { MPI_File_close(&outMPI); delete distCalculator; return 0; } + + char buf[4]; + MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); + } + }else { //you are a child process + //do your 
part + driverMPI(start, end, outMPI, cutoff); + + if (m->control_pressed) { MPI_File_close(&outMPI); delete distCalculator; return 0; } - //wait on chidren - for(int i = 1; i < processors; i++) { char buf[4]; - MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); + strcpy(buf, "done"); + //tell parent you are done. + MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD); } - if (output == "lt") { - convertToLowerTriangle(outputFile); - } + MPI_File_close(&outMPI); - }else { //you are a child process - //do your part - driverMPI(start, end, outMPI, cutoff); - - char buf[4]; - strcpy(buf, "done"); + }else { //lower triangle format + if (pid == 0) { //you are the root process + + //do your part + string outputMyPart; + long mySize; + driverMPI(start, end, outputFile, mySize); + + if (m->control_pressed) { delete distCalculator; return 0; } + + int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; // + MPI_File outMPI; + MPI_File inMPI; + + char filename[outputFile.length()]; + strcpy(filename, outputFile.c_str()); - //tell parent you are done. 
- MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD); + MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI); + + //wait on chidren + for(int b = 1; b < processors; b++) { + long fileSize; + + if (m->control_pressed) { MPI_File_close(&outMPI); delete distCalculator; return 0; } + + MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); + + string outTemp = outputFile + toString(b) + ".temp"; + char buf[outTemp.length()]; + strcpy(buf, outTemp.c_str()); + + MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + + int count = 0; + while (count < fileSize) { //read 1000 characters at a time + //send freqs + char buf2[1]; + MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status); + MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status); + count += 1; + } + + MPI_File_close(&inMPI); //deleted on close + } + + MPI_File_close(&outMPI); + }else { //you are a child process + //do your part + long size; + driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); + + if (m->control_pressed) { delete distCalculator; return 0; } + + //tell parent you are done. 
+ MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD); + } } - - MPI_File_close(&outMPI); #else #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -284,7 +347,7 @@ int DistanceCommand::execute(){ m->mothurOut("Output File Name: "); m->mothurOutEndLine(); m->mothurOut(outputFile); m->mothurOutEndLine(); m->mothurOutEndLine(); - + m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); return 0; } @@ -408,40 +471,101 @@ int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, flo if (output == "column") { outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); } } - if ((output == "square") || (output == "lt")){ //make a square column you can convert to square phylip + if (output == "square") { //make a square column you can convert to square phylip outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); outputString += (alignDB.get(j).getName() + ' ' + alignDB.get(i).getName() + ' ' + toString(dist) + '\n'); } - } if(i % 100 == 0){ - m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); + //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); + cout << i << '\t' << (time(NULL) - startTime) << endl; } - if(i % 10 == 0){ //output to file - //send results to parent - int length = outputString.length(); - char buf[length]; - strcpy(buf, outputString.c_str()); - - MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status); - outputString = ""; - } + + //send results to parent + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status); + outputString = ""; } - m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); - 
if(outputString != ""){ //output to file - //send results to parent - int length = outputString.length(); - char buf[length]; - strcpy(buf, outputString.c_str()); + //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); + cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl; + return 1; + } + catch(exception& e) { + m->errorOut(e, "DistanceCommand", "driverMPI"); + exit(1); + } +} +/**************************************************************************************************/ +/////// need to fix to work with calcs and sequencedb +int DistanceCommand::driverMPI(int startLine, int endLine, string file, long& size){ + try { + MPI_Status status; + + MPI_File outMPI; + int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char filename[file.length()]; + strcpy(filename, file.c_str()); + + MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI); + + int startTime = time(NULL); + + string outputString = ""; + size = 0; + + if((output == "lt") && startLine == 0){ outputString += toString(alignDB.getNumSeqs()) + "\n"; } + + for(int i=startLine;icontrol_pressed) { return 0; } + + distCalculator->calcDist(alignDB.get(i), alignDB.get(j)); + double dist = distCalculator->getDist(); - MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status); - outputString = ""; + if (output == "lt") { outputString += toString(dist) + "\t"; } + } + + if (output == "lt") { outputString += "\n"; } + + + if(i % 100 == 0){ + //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); + cout << i << '\t' << (time(NULL) - startTime) << endl; + } + + + //send results to parent + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write(outMPI, buf, length, MPI_CHAR, &status); + size += outputString.length(); + outputString = ""; + + } + //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); + 
cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl; + MPI_File_close(&outMPI); + return 1; } catch(exception& e) { diff --git a/distancecommand.h b/distancecommand.h index c1dac14..f825cfb 100644 --- a/distancecommand.h +++ b/distancecommand.h @@ -49,6 +49,7 @@ private: #ifdef USE_MPI int driverMPI(int, int, MPI_File&, float); + int driverMPI(int, int, string, long&); #endif int convertMatrix(string); diff --git a/distancedb.hpp b/distancedb.hpp index 01fea59..47e5fd7 100644 --- a/distancedb.hpp +++ b/distancedb.hpp @@ -25,6 +25,11 @@ public: void addSequence(Sequence); vector findClosestSequences(Sequence*, int); // returns indexes of n closest sequences to query + #ifdef USE_MPI + int MPISend(int) {return 0;} + int MPIRecv(int) {return 0;} + #endif + private: vector data; Dist* distCalculator; diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index b402a5b..3d3062b 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -167,9 +167,12 @@ int FilterSeqsCommand::execute() { inFASTA.close(); ////////////create filter///////////////// + m->mothurOut("Creating Filter... "); m->mothurOutEndLine(); filter = createFilter(); + m->mothurOutEndLine(); m->mothurOutEndLine(); + if (m->control_pressed) { return 0; } #ifdef USE_MPI @@ -193,8 +196,12 @@ int FilterSeqsCommand::execute() { ////////////run filter///////////////// + m->mothurOut("Running Filter... 
"); m->mothurOutEndLine(); + filterSequences(); - + + m->mothurOutEndLine(); m->mothurOutEndLine(); + int filteredLength = 0; for(int i=0;iMPIPos; MPI_Status status; MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are MPI_File outMPI; + MPI_File tempMPI; MPI_File inMPI; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -256,28 +265,26 @@ int FilterSeqsCommand::filterSequences() { MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); return 0; } + if (pid == 0) { //you are the root process - setLines(fastafileNames[s]); - - char bufF[alignmentLength]; - strcpy(bufF, filter.c_str()); - - for (int j = 0; j < lines.size(); j++) { //each process - if (j != 0) { //don't send to yourself - MPI_Send(&lines[j]->start, 1, MPI_INT, j, tag, MPI_COMM_WORLD); //start position in file - MPI_Send(&bufferSizes[j], 1, MPI_INT, j, tag, MPI_COMM_WORLD); //how bytes for the read - MPI_Send(bufF, alignmentLength, MPI_CHAR, j, tag, MPI_COMM_WORLD); - } - } + MPIPos = setFilePosFasta(fastafileNames[s], num); //fills MPIPos, returns numSeqs + numSeqs += num; - //read your peice of file - char buf[bufferSizes[0]]; - MPI_File_read_at(inMPI, lines[0]->start, buf, bufferSizes[0], MPI_CHAR, &status); - istringstream iss (buf,istringstream::in); + //send file positions to all processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + //figure out how many sequences you have to do + numSeqsPerProcessor = num / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = num - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + 
//do your part - driverMPIRun(iss, outMPI); + driverMPIRun(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); return 0; } //wait on chidren for(int i = 1; i < processors; i++) { @@ -286,23 +293,21 @@ int FilterSeqsCommand::filterSequences() { } }else { //you are a child process - //receive your section of file - int startPos, bufferSize; - char bufF[alignmentLength]; - MPI_Recv(&startPos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPI_Recv(&bufferSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPI_Recv(bufF, alignmentLength, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + numSeqs += num; + MPIPos.resize(num+1); + MPI_Bcast(&MPIPos[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions - filter = bufF; //filter was made by process 0 so other processes need to get it - - //read your peice of file - char buf2[bufferSize]; - MPI_File_read_at(inMPI, startPos, buf2, bufferSize, MPI_CHAR, &status); - istringstream iss (buf2,istringstream::in); + //figure out how many sequences you have to align + numSeqsPerProcessor = num / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = num - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //align your part + driverMPIRun(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); return 0; } - //do your part - driverMPIRun(iss, outMPI); - char buf[4]; strcpy(buf, "done"); @@ -361,16 +366,28 @@ int FilterSeqsCommand::filterSequences() { exit(1); } } +#ifdef USE_MPI /**************************************************************************************/ -int FilterSeqsCommand::driverMPIRun(istringstream& in, MPI_File& outMPI) { +int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos) { try { string 
outputString = ""; int count = 0; MPI_Status status; - while (!in.eof()) { + for(int i=0;icontrol_pressed) { return 0; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); - Sequence seq(in); gobble(in); + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence seq(iss); gobble(iss); if (seq.getName() != "") { string align = seq.getAligned(); @@ -396,6 +413,8 @@ int FilterSeqsCommand::driverMPIRun(istringstream& in, MPI_File& outMPI) { } } + + if((i+1) % 100 == 0){ cout << (i+1) << endl; m->mothurOutJustToLog(toString(i+1) + "\n"); } } if(outputString != ""){ //output to file @@ -407,7 +426,8 @@ int FilterSeqsCommand::driverMPIRun(istringstream& in, MPI_File& outMPI) { MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status); outputString = ""; } - + + if((num) % 100 != 0){ cout << (num) << endl; m->mothurOutJustToLog(toString(num) + "\n"); } return 0; } @@ -416,6 +436,7 @@ int FilterSeqsCommand::driverMPIRun(istringstream& in, MPI_File& outMPI) { exit(1); } } +#endif /**************************************************************************************/ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string inputFilename, linePair* line) { try { @@ -518,9 +539,9 @@ string FilterSeqsCommand::createFilter() { for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); #ifdef USE_MPI - int pid; - int Atag = 1; int Ttag = 2; int Ctag = 3; int Gtag = 4; int Gaptag = 5; + int pid, numSeqsPerProcessor, num; int tag = 2001; + vector MPIPos; MPI_Status status; MPI_File inMPI; @@ -532,80 +553,44 @@ string FilterSeqsCommand::createFilter() { MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + if (m->control_pressed) { 
MPI_File_close(&inMPI); return 0; } + if (pid == 0) { //you are the root process - setLines(fastafileNames[s]); - - for (int j = 0; j < lines.size(); j++) { //each process - if (j != 0) { //don't send to yourself - MPI_Send(&lines[j]->start, 1, MPI_INT, j, tag, MPI_COMM_WORLD); //start position in file - MPI_Send(&numSeqs, 1, MPI_INT, j, tag, MPI_COMM_WORLD); - MPI_Send(&bufferSizes[j], 1, MPI_INT, j, tag, MPI_COMM_WORLD); //how bytes for the read - } - } - - char buf[bufferSizes[0]]; - MPI_File_read_at(inMPI, 0, buf, bufferSizes[0], MPI_CHAR, &status); - - string tempBuf = buf; - if (tempBuf.length() > bufferSizes[0]) { tempBuf = tempBuf.substr(0, bufferSizes[0]); } - - MPICreateFilter(F, tempBuf); - - if (m->control_pressed) { MPI_File_close(&inMPI); return filterString; } - - vector temp; temp.resize(alignmentLength+1); + MPIPos = setFilePosFasta(fastafileNames[s], num); //fills MPIPos, returns numSeqs + numSeqs += num; - //get the frequencies from the child processes - for(int i = 0; i < ((processors-1)*5); i++) { - MPI_Recv(&temp[0], (alignmentLength+1), MPI_INT, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status); - int receiveTag = temp[temp.size()-1]; //child process added a int to the end to indicate what letter count this is for - - if (receiveTag == Atag) { //you are recieveing the A frequencies - for (int k = 0; k < alignmentLength; k++) { F.a[k] += temp[k]; } - }else if (receiveTag == Ttag) { //you are recieveing the T frequencies - for (int k = 0; k < alignmentLength; k++) { F.t[k] += temp[k]; } - }else if (receiveTag == Ctag) { //you are recieveing the C frequencies - for (int k = 0; k < alignmentLength; k++) { F.c[k] += temp[k]; } - }else if (receiveTag == Gtag) { //you are recieveing the G frequencies - for (int k = 0; k < alignmentLength; k++) { F.g[k] += temp[k]; } - }else if (receiveTag == Gaptag) { //you are recieveing the gap frequencies - for (int k = 0; k < alignmentLength; k++) { F.gap[k] += temp[k]; } - } - } - + //send file positions to all 
processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&MPIPos[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + //figure out how many sequences you have to do + numSeqsPerProcessor = num / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = num - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //do your part + MPICreateFilter(startIndex, numSeqsPerProcessor, F, inMPI, MPIPos); + if (m->control_pressed) { MPI_File_close(&inMPI); return 0; } + }else { //i am the child process - int startPos, bufferSize; - MPI_Recv(&startPos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPI_Recv(&bufferSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - - //send freqs - char buf2[bufferSize]; - MPI_File_read_at(inMPI, startPos, buf2, bufferSize, MPI_CHAR, &status); - - string tempBuf = buf2; - if (tempBuf.length() > bufferSize) { tempBuf = tempBuf.substr(0, bufferSize); } - - MPICreateFilter(F, tempBuf); - - if (m->control_pressed) { MPI_File_close(&inMPI); return filterString; } + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPIPos.resize(num+1); + numSeqs += num; + MPI_Bcast(&MPIPos[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + //figure out how many sequences you have to align + numSeqsPerProcessor = num / processors; + if(pid == (processors - 1)){ numSeqsPerProcessor = num - pid * numSeqsPerProcessor; } + int startIndex = pid * numSeqsPerProcessor; + + //do your part + MPICreateFilter(startIndex, numSeqsPerProcessor, F, inMPI, MPIPos); - //send my fequency counts - F.a.push_back(Atag); - int ierr = MPI_Send(&(F.a[0]), (alignmentLength+1), MPI_INT, 0, tag, MPI_COMM_WORLD); - F.t.push_back(Ttag); - ierr = MPI_Send (&(F.t[0]), (alignmentLength+1), MPI_INT, 0, tag, MPI_COMM_WORLD); - F.c.push_back(Ctag); - ierr = MPI_Send(&(F.c[0]), (alignmentLength+1), MPI_INT, 0, tag, MPI_COMM_WORLD); 
- F.g.push_back(Gtag); - ierr = MPI_Send(&(F.g[0]), (alignmentLength+1), MPI_INT, 0, tag, MPI_COMM_WORLD); - F.gap.push_back(Gaptag); - ierr = MPI_Send(&(F.gap[0]), (alignmentLength+1), MPI_INT, 0, tag, MPI_COMM_WORLD); + if (m->control_pressed) { MPI_File_close(&inMPI); return 0; } } - MPI_Barrier(MPI_COMM_WORLD); MPI_File_close(&inMPI); #else @@ -645,13 +630,74 @@ string FilterSeqsCommand::createFilter() { } } + +#ifdef USE_MPI + int pid; + int Atag = 1; int Ttag = 2; int Ctag = 3; int Gtag = 4; int Gaptag = 5; + MPI_Status status; + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + if (pid == 0) { //only one process should output the filter + + vector temp; temp.resize(alignmentLength+1); + + //get the frequencies from the child processes + for(int i = 0; i < ((processors-1)*5); i++) { + MPI_Recv(&temp[0], (alignmentLength+1), MPI_INT, MPI_ANY_SOURCE, 2001, MPI_COMM_WORLD, &status); + int receiveTag = temp[temp.size()-1]; //child process added a int to the end to indicate what letter count this is for + + if (receiveTag == Atag) { //you are recieveing the A frequencies + for (int k = 0; k < alignmentLength; k++) { F.a[k] += temp[k]; } + }else if (receiveTag == Ttag) { //you are recieveing the T frequencies + for (int k = 0; k < alignmentLength; k++) { F.t[k] += temp[k]; } + }else if (receiveTag == Ctag) { //you are recieveing the C frequencies + for (int k = 0; k < alignmentLength; k++) { F.c[k] += temp[k]; } + }else if (receiveTag == Gtag) { //you are recieveing the G frequencies + for (int k = 0; k < alignmentLength; k++) { F.g[k] += temp[k]; } + }else if (receiveTag == Gaptag) { //you are recieveing the gap frequencies + for (int k = 0; k < alignmentLength; k++) { F.gap[k] += temp[k]; } + } + } + }else{ + + //send my fequency counts + F.a.push_back(Atag); + int ierr = MPI_Send(&(F.a[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); + F.t.push_back(Ttag); + ierr = MPI_Send (&(F.t[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); + 
F.c.push_back(Ctag); + ierr = MPI_Send(&(F.c[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); + F.g.push_back(Gtag); + ierr = MPI_Send(&(F.g[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); + F.gap.push_back(Gaptag); + ierr = MPI_Send(&(F.gap[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); + } + + if (pid == 0) { //only one process should output the filter +#endif F.setNumSeqs(numSeqs); if(isTrue(vertical) == 1) { F.doVertical(); } if(soft != 0) { F.doSoft(); } filterString = F.getFilter(); - + +#ifdef USE_MPI + //send filter string to kids + MPI_Bcast(&filterString[0], alignmentLength, MPI_CHAR, 0, MPI_COMM_WORLD); + }else{ + //recieve filterString + char tempBuf[alignmentLength]; + MPI_Bcast(tempBuf, alignmentLength, MPI_CHAR, 0, MPI_COMM_WORLD); + + filterString = tempBuf; + if (filterString.length() > alignmentLength) { filterString = filterString.substr(0, alignmentLength); } + } + + MPI_Barrier(MPI_COMM_WORLD); +#endif + + return filterString; } catch(exception& e) { @@ -697,31 +743,43 @@ int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* exit(1); } } +#ifdef USE_MPI /**************************************************************************************/ -int FilterSeqsCommand::MPICreateFilter(Filters& F, string input) { +int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector& MPIPos) { try { - vector seqStrings; - parseBuffer(input, seqStrings); + MPI_Status status; + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - for(int i=0;icontrol_pressed = true; } - - if (m->control_pressed) { return 1; } + if (m->control_pressed) { return 0; } + + //read next sequence + int length = MPIPos[start+i+1] - MPIPos[start+i]; + + char buf4[length]; + MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); - Sequence seq("", seqStrings[i]); + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, 
length); } + istringstream iss (tempBuf,istringstream::in); + + Sequence seq(iss); + + if (seq.getAligned().length() != alignmentLength) { cout << "Alignment length is " << alignmentLength << " and sequence " << seq.getName() << " has length " << seq.getAligned().length() << ", please correct." << endl; exit(1); } if(trump != '*'){ F.doTrump(seq); } if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } cout.flush(); //report progress - if((i+1) % 100 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } + if((i+1) % 100 == 0){ cout << (i+1) << endl; m->mothurOutJustToLog(toString(i+1) + "\n"); } } //report progress - if((seqStrings.size()) % 100 != 0){ m->mothurOut(toString(seqStrings.size())); m->mothurOutEndLine(); } + if((num) % 100 != 0){ cout << num << endl; m->mothurOutJustToLog(toString(num) + "\n"); } return 0; } @@ -730,7 +788,7 @@ int FilterSeqsCommand::MPICreateFilter(Filters& F, string input) { exit(1); } } - +#endif /**************************************************************************************************/ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) { @@ -826,28 +884,4 @@ int FilterSeqsCommand::setLines(string filename) { exit(1); } } -/**************************************************************************************************/ -int FilterSeqsCommand::parseBuffer(string file, vector& seqs) { - try { - istringstream iss (file); //,istringstream::in - string name, seqstring; - - while (!iss.eof()) { - - if (m->control_pressed) { return 0; } - - Sequence seq(iss); gobble(iss); - - if (seq.getName() != "") { - seqs.push_back(seq.getAligned()); - } - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "FilterSeqsCommand", "parseBuffer"); - exit(1); - } -} /**************************************************************************************/ diff --git a/filterseqscommand.h b/filterseqscommand.h index 1d2526f..3c46036 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -47,12 
+47,14 @@ private: int filterSequences(); int createProcessesCreateFilter(Filters&, string); int createProcessesRunFilter(string, string); - int driverCreateFilter(Filters&, string, linePair*); - int driverRunFilter(string, string, string, linePair*); - int driverMPIRun(istringstream&, MPI_File&); - int MPICreateFilter(Filters&, string); + int driverRunFilter(string, string, string, linePair*); + int driverCreateFilter(Filters& F, string filename, linePair* line); + #ifdef USE_MPI + int driverMPIRun(int, int, MPI_File&, MPI_File&, vector&); + int MPICreateFilter(int, int, Filters&, MPI_File&, vector&); + #endif int setLines(string); - int parseBuffer(string, vector&); + }; diff --git a/fullmatrix.cpp b/fullmatrix.cpp index 7286649..43c6f2e 100644 --- a/fullmatrix.cpp +++ b/fullmatrix.cpp @@ -44,6 +44,7 @@ FullMatrix::FullMatrix(ifstream& filehandle) { for(int i=0;i> matrix[0][i]; + if (globaldata->sim) { matrix[0][i] = 1.0 - matrix[0][i]; } } break; } @@ -93,6 +94,7 @@ int FullMatrix::readSquareMatrix(ifstream& filehandle) { if (m->control_pressed) { delete reading; return 0; } filehandle >> matrix[i][j]; + if (globaldata->sim) { matrix[i][j] = 1.0 - matrix[i][j]; } count++; reading->update(count); @@ -135,8 +137,10 @@ int FullMatrix::readLTMatrix(ifstream& filehandle) { if (m->control_pressed) { delete reading; return 0; } filehandle >> distance; - + if (globaldata->sim) { distance = 1.0 - distance; } + matrix[i][j] = distance; matrix[j][i] = distance; + count++; reading->update(count); } diff --git a/globaldata.hpp b/globaldata.hpp index b4b8e0d..35415c7 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -43,7 +43,7 @@ public: TreeMap* gTreemap; SequenceDB* gSequenceDB; string inputFileName, argv; - bool allLines, runParse, jumble; + bool allLines, runParse, jumble, sim; vector Estimators, Groups; //holds estimators to be used set labels; //holds labels to be used vector Treenames; diff --git a/kmerdb.cpp b/kmerdb.cpp index 7f7f265..77d93da 100644 --- 
a/kmerdb.cpp +++ b/kmerdb.cpp @@ -48,6 +48,8 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) { } /**************************************************************************************************/ +KmerDB::KmerDB() : Database() {} +/**************************************************************************************************/ KmerDB::~KmerDB(){} @@ -204,6 +206,44 @@ vector KmerDB::getSequencesWithKmer(int kmer) { exit(1); } } +#ifdef USE_MPI +/**************************************************************************************************/ +int KmerDB::MPISend(int receiver) { + try { + + //send kmerSize - int + MPI_Send(&kmerSize, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "MPISend"); + exit(1); + } +} +/**************************************************************************************************/ +int KmerDB::MPIRecv(int sender) { + try { + MPI_Status status; + + //receive kmerSize - int + MPI_Recv(&kmerSize, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + //set maxKmer + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + count = 0; + maxKmer = power4s[kmerSize]; + kmerLocations.resize(maxKmer+1); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "MPIRecv"); + exit(1); + } +} +#endif +/**************************************************************************************************/ /**************************************************************************************************/ diff --git a/kmerdb.hpp b/kmerdb.hpp index bdd9ca5..513f3f0 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -26,6 +26,7 @@ class KmerDB : public Database { public: KmerDB(string, int); + KmerDB(); ~KmerDB(); void generateDB(); @@ -36,6 +37,11 @@ public: vector getSequencesWithKmer(int); //returns vector of sequences that contain kmer passed in int getMaxKmer() { return maxKmer; } + 
#ifdef USE_MPI + int MPISend(int); //just sends kmersize + int MPIRecv(int); + #endif + private: int kmerSize; diff --git a/mothur.h b/mothur.h index 7904425..dcb026c 100644 --- a/mothur.h +++ b/mothur.h @@ -941,6 +941,78 @@ inline string sortFile(string distFile){ exit(1); } } +/**************************************************************************************************/ +inline vector setFilePosFasta(string filename, int& num) { + + vector positions; + ifstream inFASTA; + openInputFile(filename, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); gobble(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + num = positions.size(); + + FILE * pFile; + long size; + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + positions.push_back(size); + + return positions; +} +/**************************************************************************************************/ +inline vector setFilePosEachLine(string filename, int& num) { + + vector positions; + ifstream in; + openInputFile(filename, in); + + string input; + while(!in.eof()){ + long lastpos = in.tellg(); + input = getline(in); gobble(in); + if (input.length() != 0) { + long pos = in.tellg(); + if (pos != -1) { positions.push_back(pos - input.length() - 1); } + else { positions.push_back(lastpos); } + } + } + in.close(); + + num = positions.size(); + + FILE * pFile; + long size; + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + positions.push_back(size); + + return positions; +} + 
/**************************************************************************************************/ #endif diff --git a/nastreport.cpp b/nastreport.cpp index aa7d393..132886b 100644 --- a/nastreport.cpp +++ b/nastreport.cpp @@ -12,6 +12,25 @@ #include "alignment.hpp" #include "nastreport.hpp" + +/******************************************************************************************************************/ + +NastReport::NastReport() { + output = ""; +} +/******************************************************************************************************************/ +string NastReport::getHeaders() { + output = ""; + + output += "QueryName\tQueryLength\tTemplateName\tTemplateLength\t"; + output += "SearchMethod\tSearchScore\t"; + output += "AlignmentMethod\tQueryStart\tQueryEnd\tTemplateStart\tTemplateEnd\t"; + output += "PairwiseAlignmentLength\tGapsInQuery\tGapsInTemplate\t"; + output += "LongestInsert\t"; + output += "SimBtwnQuery&Template\n"; + + return output; +} /******************************************************************************************************************/ NastReport::NastReport(string candidateReportFName) { @@ -47,6 +66,38 @@ void NastReport::print(){ candidateReportFile << endl; candidateReportFile.flush(); } +/******************************************************************************************************************/ + +string NastReport::getReport(){ + + output = ""; + + output += queryName + '\t' + toString(queryLength) + '\t' + templateName + '\t' + toString(templateLength) + '\t'; + + string temp = toString(searchScore); + int pos = temp.find_last_of('.'); //find deicmal point if their is one + + //if there is a decimal + if (pos != -1) { temp = temp.substr(0, pos+3); } //set precision to 2 places + else{ temp += ".00"; } + + output += searchMethod + '\t' + temp + '\t'; + output += alignmentMethod + '\t' + toString(candidateStartPosition) + "\t" + toString(candidateEndPosition) + '\t'; + output += 
toString(templateStartPosition) + "\t" + toString(templateEndPosition) + '\t'; + output += toString(pairwiseAlignmentLength) + '\t' + toString(totalGapsInQuery) + '\t' + toString(totalGapsInTemplate) + '\t'; + output += toString(longestInsert) + '\t'; + + temp = toString(similarityToTemplate); + pos = temp.find_last_of('.'); //find deicmal point if their is one + + //if there is a decimal + if (pos != -1) { temp = temp.substr(0, pos+3); } //set precision to 2 places + else{ temp += ".00"; } + + output += temp + '\n'; + + return output; +} /******************************************************************************************************************/ diff --git a/nastreport.hpp b/nastreport.hpp index 2be289c..80c9949 100644 --- a/nastreport.hpp +++ b/nastreport.hpp @@ -19,6 +19,7 @@ class NastReport { public: NastReport(string); + NastReport(); ~NastReport(); void setCandidate(Sequence*); void setTemplate(Sequence*); @@ -26,9 +27,12 @@ public: void setAlignmentParameters(string, Alignment*); void setNastParameters(Nast); void print(); + string getReport(); + string getHeaders(); private: string queryName; + string output; int queryLength; string templateName; int templateLength; diff --git a/parsesffcommand.cpp b/parsesffcommand.cpp new file mode 100644 index 0000000..ed26be0 --- /dev/null +++ b/parsesffcommand.cpp @@ -0,0 +1,562 @@ +/* + * parsesffcommand.cpp + * Mothur + * + * Created by Pat Schloss on 2/6/10. + * Copyright 2010 Patrick D. Schloss. All rights reserved. 
+ * + */ + +#include "parsesffcommand.h" +#include "sequence.hpp" + +//********************************************************************************************************************** + +ParseSFFCommand::ParseSFFCommand(string option){ + try { + abort = false; + + if(option == "help") { + help(); + abort = true; + } + else { + //valid paramters for this command + string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("sff"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["sff"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given an oligos file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + } + + + //check for required parameters + sffFile = validParameter.validFile(parameters, "sff", true); + if (sffFile == "not found"){ + m->mothurOut("sff is a required parameter for the parse.sff command."); + m->mothurOutEndLine(); + abort = true; + } + else if (sffFile == "not open") { abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); + if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(sffFile); //if user entered a file with a path then preserve it + } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + oligoFile = validParameter.validFile(parameters, "oligos", true); + if (oligoFile == "not found") { oligoFile = ""; } + else if(oligoFile == "not open"){ abort = true; } + + string temp = validParameter.validFile(parameters, "minlength", false); + if (temp == "not found") { temp = "0"; } + convert(temp, minLength); + } + } + catch(exception& e) { + m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +ParseSFFCommand::~ParseSFFCommand() { /* do nothing */ } + +//********************************************************************************************************************** + +int ParseSFFCommand::execute(){ + try { + if (abort == true) { return 0; } + + ifstream inSFF; + openInputFile(sffFile, inSFF); + + cout.setf(ios::fixed, ios::floatfield); + cout.setf(ios::showpoint); + cout << setprecision(2); + + vector flowFileNames; + if(oligoFile != ""){ + getOligos(flowFileNames); + } + else{ + flowFileNames.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + "flow").c_str(), ios::ate)); + 
outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + "flow")); + } + + for(int i=0;isetf(ios::fixed, ios::floatfield); + flowFileNames[i]->setf(ios::showpoint); + *flowFileNames[i] << setprecision(2); + } + + if (m->control_pressed) { for(int i=0;iclose(); } return 0; } + +// ofstream fastaFile; +// openOutputFile(getRootName(sffFile) + "fasta", fastaFile); + +// ofstream qualFile; +// openOutputFile(getRootName(sffFile) + "qual", qualFile); + + string commonHeader = getline(inSFF); + string magicNumber = getline(inSFF); + string version = getline(inSFF); + string indexOffset = getline(inSFF); + string indexLength = getline(inSFF); + int numReads = parseHeaderLineToInt(inSFF); + string headerLength = getline(inSFF); + string keyLength = getline(inSFF); + int numFlows = parseHeaderLineToInt(inSFF); + string flowgramCode = getline(inSFF); + string flowChars = getline(inSFF); + string keySequence = getline(inSFF); + gobble(inSFF); + + string seqName; + bool good = 0; + + for(int i=0;icontrol_pressed) { for(int i=0;iclose(); } return 0; } + + inSFF >> seqName; + seqName = seqName.substr(1); + gobble(inSFF); + + string runPrefix = parseHeaderLineToString(inSFF); + string regionNumber = parseHeaderLineToString(inSFF); + string xyLocation = parseHeaderLineToString(inSFF); + gobble(inSFF); + + string runName = parseHeaderLineToString(inSFF); + string analysisName = parseHeaderLineToString(inSFF); + string fullPath = parseHeaderLineToString(inSFF); + gobble(inSFF); + + string readHeaderLen = parseHeaderLineToString(inSFF); + string nameLength = parseHeaderLineToString(inSFF); + int numBases = parseHeaderLineToInt(inSFF); + string clipQualLeft = parseHeaderLineToString(inSFF); + int clipQualRight = parseHeaderLineToInt(inSFF); + string clipAdapLeft = parseHeaderLineToString(inSFF); + string clipAdapRight = parseHeaderLineToString(inSFF); + gobble(inSFF); + + vector flowVector = parseHeaderLineToFloatVector(inSFF, numFlows); + vector flowIndices = 
parseHeaderLineToIntVector(inSFF, numBases); + string bases = parseHeaderLineToString(inSFF); + string qualityScores = parseHeaderLineToString(inSFF); + gobble(inSFF); + + + + int flowLength = flowIndices[clipQualRight-1]; + + screenFlow(flowVector, flowLength); + string sequence = flow2seq(flowVector, flowLength); + + int group = 0; + + if(minLength != 0 || numFPrimers != 0 || numBarcodes != 0 || numRPrimers != 0){ + good = screenSeq(sequence, group); + } + + if(good){ + *flowFileNames[group] << seqName << ' ' << flowLength; + for(int i=0;imothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + +// fastaFile.close(); +// qualFile.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ParseSFFCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** + +void ParseSFFCommand::help(){ + try { + m->mothurOut("The parse.sff command..."); + m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "ParseSFFCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +void ParseSFFCommand::getOligos(vector& outSFFFlowVec){ + try { + + ifstream inOligos; + openInputFile(oligoFile, inOligos); + + string type, oligo, group; + + int index = 0; + + while(!inOligos.eof()){ + inOligos >> type; + + if(type[0] == '#'){ getline(inOligos); } // get rest of line if there's any crap there + else{ + inOligos >> oligo; + + for(int i=0;i> group; + barcodes[oligo]=index++; + groupVector.push_back(group); + + outSFFFlowVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate)); + outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + group + "flow")); + } + } + gobble(inOligos); + } + + 
inOligos.close(); + + numFPrimers = forPrimer.size(); + numRPrimers = revPrimer.size(); + numBarcodes = barcodes.size(); + } + catch(exception& e) { + m->errorOut(e, "ParseSFFCommand", "getOligos"); + exit(1); + } + +} + +//********************************************************************************************************************** + +int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){ + + int number; + + while (!file.eof()) { + + char c = file.get(); + if (c == ':'){ + file >> number; + break; + } + + } + gobble(file); + return number; +} + +//********************************************************************************************************************** + +string ParseSFFCommand::parseHeaderLineToString(ifstream& file){ + + string text; + + while (!file.eof()) { + char c = file.get(); + + if (c == ':'){ + gobble(file); + text = getline(file); + break; + } + } + gobble(file); + + return text; +} + +//********************************************************************************************************************** + +vector ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){ + + vector floatVector(length); + + while (!file.eof()) { + char c = file.get(); + if (c == ':'){ + for(int i=0;i> floatVector[i]; + } + break; + } + } + gobble(file); + return floatVector; +} + +//********************************************************************************************************************** + +vector ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){ + + vector intVector(length); + + while (!file.eof()) { + char c = file.get(); + if (c == ':'){ + for(int i=0;i> intVector[i]; + } + break; + } + } + gobble(file); + return intVector; +} + +//********************************************************************************************************************** + + +void ParseSFFCommand::screenFlow(vector flowgram, int& length){ + try{ + + int newLength = 0; + + while(newLength * 4 < length){ + + int 
signal = 0; + int noise = 0; + for(int i=0;i<4;i++){ + float flow = flowgram[i + 4 * newLength]; + + if(flow > 0.50){ + signal++; + if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work... + noise++; + } + } + } + if(noise > 0 || signal == 0){ + break; + } + newLength++; + } + length = newLength * 4; + } + + catch(exception& e) { + m->errorOut(e, "ParseSFFCommand", "screenFlow"); + exit(1); + } +} + +//********************************************************************************************************************** + +string ParseSFFCommand::flow2seq(vector flowgram, int length){ + + string flow = "TACG"; + string sequence = ""; + for(int i=8;i::iterator it=barcodes.begin();it!=barcodes.end();it++){ + if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){ + barcode = 1; + barcodeLength = (it->first).size(); + group = it->second; + break; + } + else{ + barcode = 0; + } + } + + + int fPrimer = 1; + for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); + exit(1); + } +} + +//********************************************************************************************************************** + +//string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){ +// +// +// return qScores.substr(start-1, end-start+1); +// +//} + +//********************************************************************************************************************** + +//string ParseSFFCommand::stripQualQual(string qScores, int start, int end){ +// +// start--; +// +// int startCount = 0; +// int startIndex = 0; +// +// while(startCount < start && startIndex < qScores.length()){ +// if(isspace(qScores[startIndex])){ +// startCount++; +// } +// startIndex++; +// } +// +// int endCount = startCount; +// int endIndex = startIndex; +// +// while(endCount < end && endIndex < qScores.length()){ +// if(isspace(qScores[endIndex])){ +// endCount++; +// } +// endIndex++; +// } +// +// return qScores.substr(startIndex, 
endIndex-startIndex-1);//, endCount-startCount); +// +//} + +//********************************************************************************************************************** + + diff --git a/parsesffcommand.h b/parsesffcommand.h new file mode 100644 index 0000000..409293c --- /dev/null +++ b/parsesffcommand.h @@ -0,0 +1,55 @@ +#ifndef PARSESFFCOMMAND_H +#define PARSESFFCOMMAND_H + +/* + * parsesffcommand.h + * Mothur + * + * Created by Pat Schloss on 2/6/10. + * Copyright 2010 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" + +class ParseSFFCommand : public Command { +public: + ParseSFFCommand(string); + ~ParseSFFCommand(); + int execute(); + void help(); + +private: + + int parseHeaderLineToInt(ifstream&); + vector parseHeaderLineToFloatVector(ifstream&, int); + vector parseHeaderLineToIntVector(ifstream&, int); + string parseHeaderLineToString(ifstream&); + void screenFlow(vector, int&); + string flow2seq(vector, int); + bool screenSeq(string&, int&); + bool compareDNASeq(string, string); + void getOligos(vector&); + + + string sffFile; + string oligoFile; + + int minLength; + int numFPrimers, numRPrimers, numBarcodes; + vector forPrimer, revPrimer; + map barcodes; + vector groupVector; + vector outputNames; + +// string stripSeqQual(string, int, int); +// string stripQualQual(string, int, int); + + string outputDir; + bool abort; +}; + +#endif + + diff --git a/pintail.cpp b/pintail.cpp index 6e9e95c..84c3219 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -18,10 +18,30 @@ inline bool compareQuanMembers(quanMember left, quanMember right){ } //*************************************************************************************************************** -Pintail::Pintail(string filename, string o) { - fastafile = filename; outputDir = o; - distcalculator = new eachGapDist(); - decalc = new DeCalculator(); +Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int 
win, int inc, string o) : Chimera() { + try { + + fastafile = filename; + templateFileName = temp; templateSeqs = readSeqs(temp); + filter = f; + processors = p; + setMask(mask); + consfile = cons; + quanfile = q; + window = win; + increment = inc; + outputDir = o; + + distcalculator = new eachGapDist(); + decalc = new DeCalculator(); + + doPrep(); + } + catch(exception& e) { + m->errorOut(e, "Pintail", "Pintail"); + exit(1); + } + } //*************************************************************************************************************** @@ -51,6 +71,9 @@ int Pintail::doPrep() { decalc->setMask(seqMask); + #ifdef USE_MPI + //do nothing + #else #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //find breakup of templatefile for quantiles if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } @@ -64,7 +87,7 @@ int Pintail::doPrep() { #else templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - + #endif m->mothurOut("Getting conservation... 
"); cout.flush(); if (consfile == "") { @@ -76,30 +99,15 @@ int Pintail::doPrep() { m->mothurOutEndLine(); //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } // bool reRead = false; //create filter if needed for later if (filter) { //read in all query seqs - ifstream in; - openInputFile(fastafile, in); - - vector tempQuerySeqs; - while(!in.eof()){ - if (m->control_pressed) { - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - return 0; - } + vector tempQuerySeqs = readSeqs(fastafile); - Sequence* s = new Sequence(in); - gobble(in); - - if (s->getName() != "") { tempQuerySeqs.push_back(s); } - } - in.close(); - vector temp; //merge query seqs and template seqs temp = templateSeqs; @@ -159,7 +167,6 @@ int Pintail::doPrep() { if (m->control_pressed) { return 0; } - ofstream out4, out5; string noOutliers, outliers; if ((!filter) && (seqMask == "")) { @@ -175,8 +182,9 @@ int Pintail::doPrep() { decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); if (m->control_pressed) { return 0; } + + string outputString = ""; - openOutputFile(noOutliers, out5); //adjust quantiles for (int i = 0; i < quantilesMembers.size(); i++) { vector temp; @@ -206,14 +214,16 @@ int Pintail::doPrep() { } //output quan value - out5 << i+1 << '\t'; - for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; } - out5 << endl; + outputString += toString(i+1) + "\t"; + for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; } + outputString += "\n"; quantiles[i] = temp; } - + + printQuanFile(noOutliers, outputString); + m->mothurOut("Done."); m->mothurOutEndLine(); } @@ -274,7 +284,64 @@ int Pintail::print(ostream& out, ostream& outAcc) { exit(1); } } +#ifdef USE_MPI 
+//*************************************************************************************************************** +int Pintail::print(MPI_File& out, MPI_File& outAcc) { + try { + bool results = false; + string outputString = ""; + int index = ceil(deviation); + + //is your DE value higher than the 95% + string chimera; + if (index != 0) { //if index is 0 then its an exact match to a template seq + if (quantiles[index][4] == 0.0) { + chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); + }else { + if (DE > quantiles[index][4]) { chimera = "Yes"; } + else { chimera = "No"; } + } + }else{ chimera = "No"; } + outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n"; + if (chimera == "Yes") { + cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl; + string outAccString = querySeq->getName() + "\n"; + + MPI_Status statusAcc; + int length = outAccString.length(); + char buf[length]; + strcpy(buf, outAccString.c_str()); + + MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); + + results = true; + } + outputString += "Observed\t"; + + for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; } + outputString += "\n"; + + outputString += "Expected\t"; + + for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; } + outputString += "\n"; + + MPI_Status status; + int length = outputString.length(); + char buf2[length]; + strcpy(buf2, outputString.c_str()); + + MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); + + return results; + } + catch(exception& e) { + m->errorOut(e, "Pintail", "print"); + exit(1); + } +} +#endif //*************************************************************************************************************** int 
Pintail::getChimeras(Sequence* query) { try { @@ -346,16 +413,56 @@ int Pintail::getChimeras(Sequence* query) { vector Pintail::readFreq() { try { - - ifstream in; - openInputFile(consfile, in); - + //read in probabilities and store in vector + int pos; float num; + vector prob; set h = decalc->getPos(); //positions of bases in masking sequence - //read in probabilities and store in vector - int pos; float num; + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + char inFileName[consfile.length()]; + strcpy(inFileName, consfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + while(!iss.eof()) { + iss >> pos >> num; + + if (h.count(pos) > 0) { + float Pi; + Pi = (num - 0.25) / 0.75; + + //cannot have probability less than 0. 
+ if (Pi < 0) { Pi = 0.0; } + + //do you want this spot + prob.push_back(Pi); + } + + gobble(iss); + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + openInputFile(consfile, in); + while(!in.eof()){ in >> pos >> num; @@ -373,8 +480,10 @@ vector Pintail::readFreq() { gobble(in); } - in.close(); + + #endif + return prob; } @@ -400,7 +509,7 @@ Sequence* Pintail::findPairs(Sequence* q) { exit(1); } } -/**************************************************************************************************/ +//************************************************************************************************** void Pintail::createProcessesQuan() { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -464,13 +573,12 @@ void Pintail::createProcessesQuan() { vector q; float w; int b, n; for (int j = 0; j < num; j++) { in >> w >> b >> n; - //cout << w << '\t' << b << '\t' n << endl; + quanMember newMember(w, b, n); q.push_back(newMember); } -//cout << "here" << endl; + quan[m] = q; -//cout << "now here" << endl; gobble(in); } @@ -495,8 +603,134 @@ void Pintail::createProcessesQuan() { exit(1); } } +//*************************************************************************************************************** +vector< vector > Pintail::readQuantiles() { + try { + int num; + float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; + + vector< vector > quan; + vector temp; temp.resize(6, 0); + + //to fill 0 + quan.push_back(temp); + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[quanfile.length()]; + strcpy(inFileName, quanfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + 
while(!iss.eof()) { + iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + gobble(iss); + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + openInputFile(quanfile, in); + + while(!in.eof()){ + + in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + gobble(in); + } + in.close(); + #endif + + return quan; + + } + catch(exception& e) { + m->errorOut(e, "Pintail", "readQuantiles"); + exit(1); + } +} +//***************************************************************************************************************/ + +void Pintail::printQuanFile(string file, string outputString) { + try { + + #ifdef USE_MPI + + MPI_File outQuan; + MPI_Status status; + + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char FileName[file.length()]; + strcpy(FileName, file.c_str()); + + if (pid == 0) { + MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer + + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write(outQuan, buf, length, MPI_CHAR, &status); + + MPI_File_close(&outQuan); + } + #else + ofstream outQuan; + openOutputFile(file, outQuan); + + outQuan << outputString; + + outQuan.close(); + #endif + } + catch(exception& e) { + m->errorOut(e, "Pintail", "printQuanFile"); + exit(1); + } +} + 
+//***************************************************************************************************************/ -//*************************************************************************************************************** diff --git a/pintail.h b/pintail.h index 59d4fee..7164842 100644 --- a/pintail.h +++ b/pintail.h @@ -24,7 +24,7 @@ class Pintail : public Chimera { public: - Pintail(string, string); + Pintail(string, string, bool, int, string, string, string, int, int, string); //fastafile, templatefile, filter, processors, mask, conservation, quantile, window, increment, outputDir) ~Pintail(); int getChimeras(Sequence*); @@ -33,13 +33,16 @@ class Pintail : public Chimera { void setCons(string c) { consfile = c; } void setQuantiles(string q) { quanfile = q; } + #ifdef USE_MPI + int print(MPI_File&, MPI_File&); + #endif private: Dist* distcalculator; DeCalculator* decalc; - int iters; - string fastafile, consfile; + int iters, window, increment, processors; + string fastafile, quanfile, consfile; vector templateLines; Sequence* querySeq; @@ -52,7 +55,7 @@ class Pintail : public Chimera { vector windowsForeachQuery; // windowsForeachQuery is a vector containing the starting spot in query aligned sequence for each window. 
//this is needed so you can move by bases and not just spots in the alignment - int windowSizes; //windowSizes = window size of query + int windowSizes; //windowSizes = window size of query vector windowSizesTemplate; //windowSizesTemplate[0] = window size of templateSeqs[0] map trimmed; //trimmed = start and stop of trimmed sequences for query @@ -67,12 +70,13 @@ class Pintail : public Chimera { set h; string mergedFilterString; - + vector< vector > readQuantiles(); vector readFreq(); Sequence* findPairs(Sequence*); void createProcessesQuan(); int doPrep(); + void printQuanFile(string, string); }; diff --git a/readcolumn.cpp b/readcolumn.cpp index d58b037..f61a40c 100644 --- a/readcolumn.cpp +++ b/readcolumn.cpp @@ -1,151 +1,154 @@ -/* - * readcolumn.cpp - * Mothur - * - * Created by Sarah Westcott on 4/21/09. - * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. - * - */ - -#include "readcolumn.h" -#include "progress.hpp" - -/***********************************************************************/ - -ReadColumnMatrix::ReadColumnMatrix(string df) : distFile(df){ - - successOpen = openInputFile(distFile, fileHandle); - -} - -/***********************************************************************/ - -int ReadColumnMatrix::read(NameAssignment* nameMap){ - try { - - string firstName, secondName; - float distance; - int nseqs = nameMap->size(); - - list = new ListVector(nameMap->getListVector()); - - Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs); - - int lt = 1; - int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose - int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix - - //need to see if this is a square or a triangular matrix... - - while(fileHandle && lt == 1){ //let's assume it's a triangular matrix... 
- - - fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance - - if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } - - map::iterator itA = nameMap->find(firstName); - map::iterator itB = nameMap->find(secondName); - - if(itA == nameMap->end()){ - cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); - } - if(itB == nameMap->end()){ - cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); - } - - if (distance == -1) { distance = 1000000; } - - if(distance < cutoff && itA != itB){ - if(itA->second > itB->second){ - PCell value(itA->second, itB->second, distance); - - if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... - refRow = itA->second; - refCol = itB->second; - D->addCell(value); - } - else if(refRow == itA->second && refCol == itB->second){ - lt = 0; - } - else{ - D->addCell(value); - } - } - else if(itA->second < itB->second){ - PCell value(itB->second, itA->second, distance); - - if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... 
- refRow = itA->second; - refCol = itB->second; - D->addCell(value); - } - else if(refRow == itB->second && refCol == itA->second){ - lt = 0; - } - else{ - D->addCell(value); - } - } - reading->update(itA->second * nseqs); - } - gobble(fileHandle); - } - - if(lt == 0){ // oops, it was square - fileHandle.close(); //let's start over - D->clear(); //let's start over - - openInputFile(distFile, fileHandle); //let's start over - - while(fileHandle){ - fileHandle >> firstName >> secondName >> distance; - - if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } - - map::iterator itA = nameMap->find(firstName); - map::iterator itB = nameMap->find(secondName); - - if(itA == nameMap->end()){ - cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; - } - if(itB == nameMap->end()){ - cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; - } - - if (distance == -1) { distance = 1000000; } - - if(distance < cutoff && itA->second > itB->second){ - PCell value(itA->second, itB->second, distance); - D->addCell(value); - reading->update(itA->second * nseqs); - } - - gobble(fileHandle); - } - } - - if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } - - reading->finish(); - fileHandle.close(); - - list->setLabel("0"); - - return 1; - - } - catch(exception& e) { - m->errorOut(e, "ReadColumnMatrix", "read"); - exit(1); - } -} - -/***********************************************************************/ - -ReadColumnMatrix::~ReadColumnMatrix(){ - //delete D; - //delete list; -} - - +/* + * readcolumn.cpp + * Mothur + * + * Created by Sarah Westcott on 4/21/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. 
+ * + */ + +#include "readcolumn.h" +#include "progress.hpp" + +/***********************************************************************/ + +ReadColumnMatrix::ReadColumnMatrix(string df) : distFile(df){ + + successOpen = openInputFile(distFile, fileHandle); + +} + +/***********************************************************************/ + +int ReadColumnMatrix::read(NameAssignment* nameMap){ + try { + + string firstName, secondName; + float distance; + int nseqs = nameMap->size(); + + list = new ListVector(nameMap->getListVector()); + + Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs); + + int lt = 1; + int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose + int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix + + //need to see if this is a square or a triangular matrix... + + while(fileHandle && lt == 1){ //let's assume it's a triangular matrix... + + + fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + + if(itA == nameMap->end()){ + cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); + } + if(itB == nameMap->end()){ + cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); + } + + if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + if(distance < cutoff && itA != itB){ + if(itA->second > itB->second){ + PCell value(itA->second, itB->second, distance); + + if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... 
+ refRow = itA->second; + refCol = itB->second; + D->addCell(value); + } + else if(refRow == itA->second && refCol == itB->second){ + lt = 0; + } + else{ + D->addCell(value); + } + } + else if(itA->second < itB->second){ + PCell value(itB->second, itA->second, distance); + + if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... + refRow = itA->second; + refCol = itB->second; + D->addCell(value); + } + else if(refRow == itB->second && refCol == itA->second){ + lt = 0; + } + else{ + D->addCell(value); + } + } + reading->update(itA->second * nseqs); + } + gobble(fileHandle); + } + + if(lt == 0){ // oops, it was square + + fileHandle.close(); //let's start over + D->clear(); //let's start over + + openInputFile(distFile, fileHandle); //let's start over + + while(fileHandle){ + fileHandle >> firstName >> secondName >> distance; + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + + if(itA == nameMap->end()){ + cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; + } + if(itB == nameMap->end()){ + cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; + } + + if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. 
+ + if(distance < cutoff && itA->second > itB->second){ + PCell value(itA->second, itB->second, distance); + D->addCell(value); + reading->update(itA->second * nseqs); + } + + gobble(fileHandle); + } + } + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + reading->finish(); + fileHandle.close(); + + list->setLabel("0"); + + return 1; + + } + catch(exception& e) { + m->errorOut(e, "ReadColumnMatrix", "read"); + exit(1); + } +} + +/***********************************************************************/ + +ReadColumnMatrix::~ReadColumnMatrix(){ + //delete D; + //delete list; +} + + diff --git a/readdistcommand.cpp b/readdistcommand.cpp index 60d300f..bcecb78 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -22,7 +22,7 @@ ReadDistCommand::ReadDistCommand(string option) { else { //valid paramters for this command - string Array[] = {"phylip", "column", "name", "cutoff", "precision", "group","outputdir","inputdir"}; + string Array[] = {"phylip", "column", "name", "cutoff", "precision", "group","outputdir","inputdir","sim"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -119,9 +119,13 @@ ReadDistCommand::ReadDistCommand(string option) { // ...at some point should added some additional type checking... 
//get user cutoff and precision or use defaults string temp; - temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } + temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } convert(temp, precision); + temp = validParameter.validFile(parameters, "sim", false); if (temp == "not found") { temp = "F"; } + sim = isTrue(temp); + globaldata->sim = sim; + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); cutoff += (5 / (precision * 10.0)); @@ -166,11 +170,12 @@ ReadDistCommand::ReadDistCommand(string option) { void ReadDistCommand::help(){ try { - m->mothurOut("The read.dist command parameter options are phylip or column, group, name, cutoff and precision\n"); + m->mothurOut("The read.dist command parameter options are phylip or column, group, name, sim, cutoff and precision\n"); m->mothurOut("The read.dist command can be used in two ways. The first is to read a phylip or column and run the cluster command\n"); m->mothurOut("For this use the read.dist command should be in the following format: \n"); m->mothurOut("read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) \n"); m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"); + m->mothurOut("The sim parameter is used to indicate that your distance file contains similiarity values instead of distance values. The default is false, if sim=true then mothur will convert the similairity values to distances. \n"); m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. 
If you do not provide a precision value then 100 is assumed.\n"); m->mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n"); m->mothurOut("For this use the read.dist command should be in the following format: \n"); @@ -204,7 +209,7 @@ int ReadDistCommand::execute(){ size_t numDists = 0; vector outputNames; -cout << format << endl; + if (format == "matrix") { ifstream in; openInputFile(distFileName, in); diff --git a/readdistcommand.h b/readdistcommand.h index 1f852be..937ca3f 100644 --- a/readdistcommand.h +++ b/readdistcommand.h @@ -42,7 +42,7 @@ private: string phylipfile, columnfile, namefile, groupfile, outputDir; NameAssignment* nameMap; - bool abort; + bool abort, sim; }; diff --git a/readmatrix.hpp b/readmatrix.hpp index d4edb5b..31a4da4 100644 --- a/readmatrix.hpp +++ b/readmatrix.hpp @@ -21,7 +21,7 @@ class SparseMatrix; class ReadMatrix { public: - ReadMatrix(){ D = new SparseMatrix(); m = MothurOut::getInstance(); } + ReadMatrix(){ D = new SparseMatrix(); m = MothurOut::getInstance(); globaldata = GlobalData::getInstance(); } virtual ~ReadMatrix() {} virtual int read(NameAssignment*){ return 1; } @@ -38,6 +38,7 @@ protected: GlobalData* globaldata; float cutoff; MothurOut* m; + bool sim; }; diff --git a/readphylip.cpp b/readphylip.cpp index edda415..f155456 100644 --- a/readphylip.cpp +++ b/readphylip.cpp @@ -85,6 +85,7 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. 
if(distance < cutoff){ PCell value(i, j, distance); @@ -104,6 +105,7 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ if (m->control_pressed) { delete reading; fileHandle.close(); return 0; } if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff){ PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance); @@ -135,6 +137,7 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff && j < i){ PCell value(i, j, distance); @@ -153,9 +156,10 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } - if (distance == -1) { distance = 1000000; } + if (distance == -1) { distance = 1000000; } + else if (globaldata->sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. 
- if(distance < cutoff && j < i){ + if(distance < cutoff && j < i){ PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance); D->addCell(value); } diff --git a/sequence.cpp b/sequence.cpp index 4b56675..19adf79 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -443,5 +443,65 @@ void Sequence::reverseComplement(){ aligned = temp; } - +#ifdef USE_MPI //******************************************************************************************************************** +int Sequence::MPISend(int receiver) { + try { + //send name - string + int length = name.length(); + char buf[name.length()]; + strcpy(buf, name.c_str()); + + MPI_Send(&length, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + MPI_Send(&buf, length, MPI_CHAR, receiver, 2001, MPI_COMM_WORLD); + + //send aligned - string + length = aligned.length(); + char buf2[aligned.length()]; + strcpy(buf2, aligned.c_str()); + + MPI_Send(&length, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + MPI_Send(&buf2, length, MPI_CHAR, receiver, 2001, MPI_COMM_WORLD); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "Sequence", "MPISend"); + exit(1); + } +} +/**************************************************************************************************/ +int Sequence::MPIRecv(int sender) { + try { + MPI_Status status; + + //receive name - string + int length; + MPI_Recv(&length, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + char buf[length]; + MPI_Recv(&buf, length, MPI_CHAR, sender, 2001, MPI_COMM_WORLD, &status); + name = buf; + + //receive aligned - string + MPI_Recv(&length, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + char buf2[length]; + MPI_Recv(&buf2, length, MPI_CHAR, sender, 2001, MPI_COMM_WORLD, &status); + aligned = buf2; + + setAligned(aligned); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "Sequence", "MPIRecv"); + exit(1); + } +} +#endif +/**************************************************************************************************/ 
diff --git a/sequence.hpp b/sequence.hpp index 5f84d44..21b4c38 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -47,6 +47,9 @@ public: bool getIsAligned(); void printSequence(ostream&); + int MPISend(int); //not working at the moment... + int MPIRecv(int); //not working at the moment... + private: MothurOut* m; void initialize(); diff --git a/suffixdb.cpp b/suffixdb.cpp index 2aa8f3f..b3496f6 100644 --- a/suffixdb.cpp +++ b/suffixdb.cpp @@ -26,6 +26,12 @@ SuffixDB::SuffixDB(int numSeqs) : Database() { suffixForest.resize(numSeqs); count = 0; } +/**************************************************************************************************/ + +SuffixDB::SuffixDB() : Database() { + count = 0; +} + /**************************************************************************************************/ //assumes sequences have been added using addSequence vector SuffixDB::findClosestSequences(Sequence* candidateSeq, int num){ @@ -76,4 +82,36 @@ void SuffixDB::addSequence(Sequence seq) { SuffixDB::~SuffixDB(){ for (int i = (suffixForest.size()-1); i >= 0; i--) { suffixForest.pop_back(); } } +#ifdef USE_MPI +/**************************************************************************************************/ +int SuffixDB::MPISend(int receiver) { + try { + + //send numSeqs - int + MPI_Send(&numSeqs, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SuffixDB", "MPISend"); + exit(1); + } +} +/**************************************************************************************************/ +int SuffixDB::MPIRecv(int sender) { + try { + MPI_Status status; + //receive numSeqs - int + MPI_Recv(&numSeqs, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status); + + suffixForest.resize(numSeqs); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SuffixDB", "MPIRecv"); + exit(1); + } +} +#endif /**************************************************************************************************/ diff --git a/suffixdb.hpp 
b/suffixdb.hpp index 1baa99e..4dc7e0f 100644 --- a/suffixdb.hpp +++ b/suffixdb.hpp @@ -27,11 +27,17 @@ class SuffixDB : public Database { public: SuffixDB(int); + SuffixDB(); ~SuffixDB(); void generateDB() {}; //adding sequences generates the db void addSequence(Sequence); vector findClosestSequences(Sequence*, int); + + #ifdef USE_MPI + int MPISend(int); //just sends numSeqs + int MPIRecv(int); + #endif private: vector suffixForest; diff --git a/validparameter.cpp b/validparameter.cpp index 0c32a3e..2d29ee3 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -207,11 +207,23 @@ string ValidParameters::validFile(map container, string paramete if(it != container.end()){ //no parameter given if(isFile == true) { + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif ableToOpen = openInputFile(it->second, in); if (ableToOpen == 1) { return "not open"; } in.close(); + + #ifdef USE_MPI + } + #endif + } }else { return "not found"; } -- 2.39.2