X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimera.cpp;h=53e77322f52ba40eb478e0b385a84982de752d6b;hp=bf16de4e34b795694d1a894ea1005f32353be110;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=5a1e62397b91f57d0d3aff635891df04b8999a88 diff --git a/chimera.cpp b/chimera.cpp index bf16de4..53e7732 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -8,13 +8,14 @@ */ #include "chimera.h" +#include "referencedb.h" //*************************************************************************************************************** //this is a vertical soft filter -void Chimera::createFilter(vector seqs) { +string Chimera::createFilter(vector seqs, float t) { try { filterString = ""; - int threshold = int (0.5 * seqs.size()); + int threshold = int (t * seqs.size()); //cout << "threshhold = " << threshold << endl; vector gaps; gaps.resize(seqs[0]->getAligned().length(), 0); @@ -22,14 +23,18 @@ void Chimera::createFilter(vector seqs) { vector t; t.resize(seqs[0]->getAligned().length(), 0); vector g; g.resize(seqs[0]->getAligned().length(), 0); vector c; c.resize(seqs[0]->getAligned().length(), 0); - + filterString = (string(seqs[0]->getAligned().length(), '1')); //for each sequence for (int i = 0; i < seqs.size(); i++) { + if (m->control_pressed) { return filterString; } + string seqAligned = seqs[i]->getAligned(); + if (seqAligned.length() != filterString.length()) { m->mothurOut(seqs[i]->getName() + " is not the same length as the template sequences. Aborting!\n"); exit(1); } + for (int j = 0; j < seqAligned.length(); j++) { //if this spot is a gap if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; } @@ -40,76 +45,218 @@ void Chimera::createFilter(vector seqs) { } } - //zero out spot where all sequences have blanks //zero out spot where all sequences have blanks int numColRemoved = 0; for(int i = 0;i < seqs[0]->getAligned().length(); i++){ + + if (m->control_pressed) { return filterString; } + if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; } else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; numColRemoved++; } //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; } - -//cout << "filter = " << filterString << endl; - mothurOut("Filter removed " + toString(numColRemoved) + " columns."); mothurOutEndLine(); + if (threshold != 0.0) { m->mothurOut("Filter removed " + toString(numColRemoved) + " columns."); m->mothurOutEndLine(); } + + return filterString; } catch(exception& e) { - errorOut(e, "Chimera", "createFilter"); + m->errorOut(e, "Chimera", "createFilter"); exit(1); } } //*************************************************************************************************************** -void Chimera::runFilter(Sequence* seq) { +map Chimera::runFilter(Sequence* seq) { try { - + map maskMap; string seqAligned = seq->getAligned(); string newAligned = ""; + int count = 0; for (int j = 0; j < seqAligned.length(); j++) { //if this spot is a gap - if (filterString[j] == '1') { newAligned += seqAligned[j]; } + if (filterString[j] == '1') { + newAligned += seqAligned[j]; + maskMap[count] = j; + count++; + } } seq->setAligned(newAligned); + + return maskMap; } catch(exception& e) { - errorOut(e, "Chimera", "runFilter"); + m->errorOut(e, "Chimera", "runFilter"); exit(1); } } //*************************************************************************************************************** vector Chimera::readSeqs(string file) { try { - - mothurOut("Reading sequences... "); cout.flush(); - ifstream in; - openInputFile(file, in); + vector container; int count = 0; - int length = 0; + length = 0; unaligned = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); - //read in seqs and store in vector - while(!in.eof()){ + if (file == "saved") { + - Sequence* current = new Sequence(in); gobble(in); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); - if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length - else if (length != current->getAligned().length()) { //seqs are unaligned - unaligned = true; + for (int i = 0; i < rdb->referenceSeqs.size(); i++) { + Sequence* temp = new Sequence(rdb->referenceSeqs[i].getName(), rdb->referenceSeqs[i].getAligned()); + + if (count == 0) { length = temp->getAligned().length(); count++; } //gets first seqs length + else if (length != temp->getAligned().length()) { unaligned = true; } + + if (temp->getName() != "") { container.push_back(temp); } } - if (current->getName() != "") { container.push_back(current); } - } + templateFileName = rdb->getSavedReference(); + + }else { + + m->mothurOut("Reading sequences from " + file + "..."); cout.flush(); + + #ifdef USE_MPI + int pid, processors; + vector positions; + int numSeqs; + int tag = 2001; + MPI_File inMPI; + MPI_Status status; + + if (byGroup) { + char inFileName[1024]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_SELF, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs + + //read file + for(int i=0;icontrol_pressed) { MPI_File_close(&inMPI); return container; } + + //read next sequence + int seqlength = positions[i+1] - positions[i]; + char* buf4 = new char[seqlength]; + + MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + + Sequence* current = new Sequence(iss); + if (current->getName() != "") { + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + container.push_back(current); + if (rdb->save) { rdb->referenceSeqs.push_back(*current); } + } + } + + MPI_File_close(&inMPI); + + }else { + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + //char* inFileName = new char[file.length()]; + //memcpy(inFileName, file.c_str(), file.length()); + + char inFileName[1024]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { + positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } + + //read file + for(int i=0;icontrol_pressed) { MPI_File_close(&inMPI); return container; } + + //read next sequence + int seqlength = positions[i+1] - positions[i]; + char* buf4 = new char[seqlength]; + + MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + + Sequence* current = new Sequence(iss); + if (current->getName() != "") { + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + container.push_back(current); + if (rdb->save) { rdb->referenceSeqs.push_back(*current); } + } + } + + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + } + #else + + ifstream in; + m->openInputFile(file, in); + + //read in seqs and store in vector + while(!in.eof()){ + + if (m->control_pressed) { return container; } + + Sequence* current = new Sequence(in); m->gobble(in); + + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + if (current->getName() != "") { + container.push_back(current); + if (rdb->save) { rdb->referenceSeqs.push_back(*current); } + } + } + in.close(); + #endif - in.close(); - mothurOut("Done."); mothurOutEndLine(); + m->mothurOut("Done."); m->mothurOutEndLine(); + + filterString = (string(container[0]->getAligned().length(), '1')); + } return container; } catch(exception& e) { - errorOut(e, "Chimera", "readSeqs"); + m->errorOut(e, "Chimera", "readSeqs"); exit(1); } } @@ -123,64 +270,60 @@ void Chimera::setMask(string filename) { }else if (filename == "") { //do nothing seqMask = ""; }else{ - ifstream infile; - openInputFile(filename, infile); + + #ifdef USE_MPI + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; - while (!infile.eof()) { - Sequence temp(infile); - seqMask = temp.getAligned(); - - gobble(infile); - } + //char* inFileName = new char[filename.length()]; + //memcpy(inFileName, filename.c_str(), filename.length()); - infile.close(); - } - } - catch(exception& e) { - errorOut(e, "Chimera", "setMask"); - exit(1); - } -} -//*************************************************************************************************************** - -vector< vector > Chimera::readQuantiles() { - try { - - ifstream in; - openInputFile(quanfile, in); - - vector< vector > quan; - vector temp; temp.resize(6, 0); - - //to fill 0 - quan.push_back(temp); + char inFileName[1024]; + strcpy(inFileName, filename.c_str()); - int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; - - while(!in.eof()){ + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_get_size(inMPI, &size); + + //delete inFileName; - in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); - temp.clear(); + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + delete buffer; + + if (!iss.eof()) { + Sequence temp(iss); + seqMask = temp.getAligned(); + }else { + m->mothurOut("Problem with mask."); m->mothurOutEndLine(); + seqMask = ""; + } - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); + MPI_File_close(&inMPI); + #else + + ifstream infile; + m->openInputFile(filename, infile); - quan.push_back(temp); + if (!infile.eof()) { + Sequence temp(infile); + seqMask = temp.getAligned(); + }else { + m->mothurOut("Problem with mask."); m->mothurOutEndLine(); + seqMask = ""; + } + infile.close(); + #endif - gobble(in); } - - in.close(); - return quan; - } catch(exception& e) { - errorOut(e, "Chimera", "readQuantiles"); + m->errorOut(e, "Chimera", "setMask"); exit(1); } } @@ -198,14 +341,14 @@ Sequence* Chimera::getSequence(string name) { } } - if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; } + if(spot == -1) { m->mothurOut("Error: Could not find sequence."); m->mothurOutEndLine(); return NULL; } temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned()); return temp; } catch(exception& e) { - errorOut(e, "Chimera", "getSequence"); + m->errorOut(e, "Chimera", "getSequence"); exit(1); } }