X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimeracheckrdp.cpp;h=42b531282c5f020664277a9e907699a3bf227658;hp=73bc1a06c0fe49d8997965c16b4cb3c0485b16db;hb=1a20e24ee786195ab0e1cccd4f5aede7a88f3f4e;hpb=1244c4907c07baea86b0f0676d098a29d2e95a39 diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index 73bc1a0..42b5312 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -10,168 +10,176 @@ #include "chimeracheckrdp.h" //*************************************************************************************************************** -ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) { fastafile = filename; templateFile = temp; } +ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string n, bool s, int inc, int k, string o) : Chimera() { + try { + fastafile = filename; + templateFileName = temp; + name = n; + svg = s; + increment = inc; + kmerSize = k; + outputDir = o; + + templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0, rand()); + m->mothurOutEndLine(); + + kmer = new Kmer(kmerSize); + + if (name != "") { + readName(name); //fills name map with names of seqs the user wants to have .svg for. + } + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "ChimeraCheckRDP"); + exit(1); + } +} //*************************************************************************************************************** ChimeraCheckRDP::~ChimeraCheckRDP() { try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } delete templateDB; delete kmer; } catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "~AlignSim"); + m->errorOut(e, "ChimeraCheckRDP", "~ChimeraCheckRDP"); exit(1); } } //*************************************************************************************************************** -void ChimeraCheckRDP::print(ostream& out) { +Sequence ChimeraCheckRDP::print(ostream& out, ostream& outAcc) { try { - mothurOutEndLine(); - /* - for (int i = 0; i < querySeqs.size(); i++) { + m->mothurOut("Processing: " + querySeq->getName()); m->mothurOutEndLine(); + + out << querySeq->getName() << endl; + out << "IS scores: " << '\t'; - int j = 0; float largest = -10; - //find largest sim value - for (int k = 0; k < IS[i].size(); k++) { - //is this score larger - if (IS[i][k].score > largest) { - j = k; - largest = IS[i][k].score; + for (int k = 0; k < IS.size(); k++) { + out << IS[k].score << '\t'; + } + out << endl; + + if (svg) { + if (name != "") { //if user has specific names + map::iterator it = names.find(querySeq->getName()); + + if (it != names.end()) { //user wants pic of this + makeSVGpic(IS); //zeros out negative results } + }else{//output them all + makeSVGpic(IS); //zeros out negative results } - - //find parental similarity - distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent)); - float dist = distCalc->getDist(); - - //convert to similarity - dist = (1 - dist) * 100; - - //warn about parental similarity - if its above 82% may not detect a chimera - if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); } - - int index = ceil(dist); - - if (index == 0) { index=1; } - - //is your DE value higher than the 95% - string chimera; - if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; } - else { chimera = "No"; } - - out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl; - - if (chimera == "Yes") { - mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine(); - } - out << "Improvement Score\t"; - - for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; } - out << endl; - }*/ + } + + return *querySeq; } catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "print"); + m->errorOut(e, "ChimeraCheckRDP", "print"); exit(1); } } - +#ifdef USE_MPI //*************************************************************************************************************** -void ChimeraCheckRDP::getChimeras() { +Sequence ChimeraCheckRDP::print(MPI_File& out, MPI_File& outAcc) { try { - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - //templateSeqs = readSeqs(templateFile); - templateDB = new KmerDB(templateFile, kmerSize); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - IS.resize(numSeqs); - closest.resize(numSeqs); + cout << "Processing: " << querySeq->getName() << endl; - //break up file if needed - int linesPerProcess = numSeqs / processors ; + string outString = ""; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } + outString += querySeq->getName() + "\nIS scores: \t"; - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - - kmer = new Kmer(kmerSize); - - //find closest seq to each querySeq - for (int i = 0; i < querySeqs.size(); i++) { - closest[i] = templateDB->findClosestSequence(querySeqs[i]); + for (int k = 0; k < IS.size(); k++) { + outString += toString(IS[k].score) + "\t"; } + outString += "\n"; - //fill seqKmerInfo for query seqs - for (int i = 0; i < querySeqs.size(); i++) { - seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned()); - } - - //fill seqKmerInfo for closest - for (int i = 0; i < closest.size(); i++) { - seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned()); - } + MPI_Status status; + int length = outString.length(); + char* buf = new char[length]; + memcpy(buf, outString.c_str(), length); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + delete buf; - - //for each query find IS value - this should be paralellized, - //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split - for (int i = 0; i < querySeqs.size(); i++) { - IS[i] = findIS(i); //fills seqKmerInfo + if (svg) { + if (name != "") { //if user has specific names + map::iterator it = names.find(querySeq->getName()); + + if (it != names.end()) { //user wants pic of this + makeSVGpic(IS); //zeros out negative results + } + }else{//output them all + makeSVGpic(IS); //zeros out negative results + } } + return *querySeq; + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "print"); + exit(1); + } +} +#endif +//*************************************************************************************************************** +int ChimeraCheckRDP::getChimeras(Sequence* query) { + try { - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - + IS.clear(); + + querySeq = query; + closest = templateDB->findClosestSequence(query); + + IS = findIS(); + + //determine chimera report cutoff - window score above 95% + //getCutoff(); - not very acurate predictor + + return 0; } catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "getChimeras"); + m->errorOut(e, "ChimeraCheckRDP", "getChimeras"); exit(1); } } //*************************************************************************************************************** -vector ChimeraCheckRDP::findIS(int query) { +vector ChimeraCheckRDP::findIS() { try { + + vector< map > queryKmerInfo; //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq + //example: seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli. + //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the + //kmers 2 seqs had in common. There may be a better way to do this thats why I am leaving so many comments... + vector< map > subjectKmerInfo; + vector isValues; - string queryName = querySeqs[query]->getName(); - string seq = querySeqs[query]->getUnaligned(); + string queryName = querySeq->getName(); + string seq = querySeq->getUnaligned(); - mothurOut("Finding IS values for sequence " + query); mothurOutEndLine(); + queryKmerInfo = kmer->getKmerCounts(seq); + subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned()); //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each - int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]); - + int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]); + //you don't want the starting point to be virtually at hte end so move it in 10% int start = seq.length() / 10; //for each window - for (int m = start; m < (seq.length() - start); m+=increment) { + for (int f = start; f < (seq.length() - start); f+=increment) { + + if (m->control_pressed) { return isValues; } + + if ((f - kmerSize) < 0) { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); } sim temp; - string fragLeft = seq.substr(0, m); //left side of breakpoint - string fragRight = seq.substr(m, seq.length()); //right side of breakpoint + string fragLeft = seq.substr(0, f); //left side of breakpoint + string fragRight = seq.substr(f); //right side of breakpoint //make a sequence of the left side and right side Sequence* left = new Sequence(queryName, fragLeft); @@ -179,77 +187,124 @@ vector ChimeraCheckRDP::findIS(int query) { //find seqs closest to each fragment Sequence closestLeft = templateDB->findClosestSequence(left); + Sequence closestRight = templateDB->findClosestSequence(right); + + //get kmerinfo for the closest left + vector< map > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned()); - map::iterator itleft; - map::iterator itleftclose; - - //get kmer in the closest seqs - //if it's not found calc kmer info and save, otherwise use already calculated data - //left - it = seqKmerInfo.find(closestLeft.getName()); - if (it == seqKmerInfo.end()) { //you have to calc it - seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned()); - } - - //right - it = seqKmerInfo.find(closestRight.getName()); - if (it == seqKmerInfo.end()) { //you have to calc it - seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned()); - } + //get kmerinfo for the closest right + vector< map > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned()); //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side //iterate through left sides map to subtract the number of times you saw things before you got the the right side - map rightside; - for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) { - int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first]; //times that kmer was seen in total - + map rightside = queryKmerInfo[queryKmerInfo.size()-1]; + for (map::iterator itleft = queryKmerInfo[f-kmerSize].begin(); itleft != queryKmerInfo[f-kmerSize].end(); itleft++) { + int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first]; //times that kmer was seen in total + //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side int howmanyright = howManyTotal - itleft->second; - //if any were seen just on the right add that ammount to map - if (howmanyright > 0) { - rightside[itleft->first] = howmanyright; + //if any were seen just on the left erase + if (howmanyright == 0) { + rightside.erase(itleft->first); } } - //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right - //this way you can get the map for just the fragment you want to compare and not hte whole sequence - map rightsideclose; - for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) { - int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first]; //times that kmer was seen in total - + map closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1]; + for (map::iterator itright = closeRightKmerInfo[f-kmerSize].begin(); itright != closeRightKmerInfo[f-kmerSize].end(); itright++) { + int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first]; //times that kmer was seen in total + //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side - int howmanyright = howManyTotal - itleftclose->second; + int howmanyright = howManyTotal - itright->second; - //if any were seen just on the right add that ammount to map - if (howmanyright > 0) { - rightsideclose[itleftclose->first] = howmanyright; + //if any were seen just on the left erase + if (howmanyright == 0) { + closerightside.erase(itright->first); } } - int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]); - int nRight = calcKmers(rightsideclose, rightside); + int nLeft = calcKmers(closeLeftKmerInfo[f-kmerSize], queryKmerInfo[f-kmerSize]); + + int nRight = calcKmers(closerightside, rightside); + int is = nLeft + nRight - nTotal; - + //save IS, leftparent, rightparent, breakpoint temp.leftParent = closestLeft.getName(); temp.rightParent = closestRight.getName(); temp.score = is; - temp.midpoint = m; + temp.midpoint = f; isValues.push_back(temp); delete left; delete right; - } + } return isValues; } catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "findIS"); + m->errorOut(e, "ChimeraCheckRDP", "findIS"); + exit(1); + } +} +//*************************************************************************************************************** +void ChimeraCheckRDP::readName(string namefile) { + try{ + + string name; + + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + //char* inFileName = new char[namefile.length()]; + //memcpy(inFileName, namefile.c_str(), namefile.length()); + + char inFileName[1024]; + strcpy(inFileName, namefile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + //delete inFileName; + + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + delete buffer; + + while(!iss.eof()) { + iss >> name; m->gobble(iss); + names[name] = name; + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + m->openInputFile(namefile, in); + + while (!in.eof()) { + in >> name; m->gobble(in); + names[name] = name; + } + in.close(); + + #endif + + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "readName"); exit(1); } } @@ -260,25 +315,26 @@ int ChimeraCheckRDP::calcKmers(map query, map subject) { try{ int common = 0; - map::iterator small; - map::iterator large; + map::iterator smallone; + map::iterator largeone; + if (query.size() < subject.size()) { - for (small = query.begin(); small != query.end(); small++) { - large = subject.find(small->first); + for (smallone = query.begin(); smallone != query.end(); smallone++) { + largeone = subject.find(smallone->first); //if you found it they have that kmer in common - if (large != subject.end()) { common++; } + if (largeone != subject.end()) { common++; } } }else { - for (small = subject.begin(); small != subject.end(); small++) { - large = query.find(small->first); + for (smallone = subject.begin(); smallone != subject.end(); smallone++) { + largeone = query.find(smallone->first); //if you found it they have that kmer in common - if (large != query.end()) { common++; } + if (largeone != query.end()) { common++; } } } @@ -286,10 +342,148 @@ int ChimeraCheckRDP::calcKmers(map query, map subject) { } catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "calcKmers"); + m->errorOut(e, "ChimeraCheckRDP", "calcKmers"); exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +void ChimeraCheckRDP::makeSVGpic(vector info) { + try{ + + string file = outputDir + querySeq->getName() + ".chimeracheck.svg"; + + MPI_File outSVG; + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + //char* FileName = new char[file.length()]; + //memcpy(FileName, file.c_str(), file.length()); + + char FileName[1024]; + strcpy(FileName, file.c_str()); + + MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outSVG); //comm, filename, mode, info, filepointer + + //delete FileName; + + int width = (info.size()*5) + 150; + + string outString = ""; + + outString += "\n"; + outString += "\n"; + outString += "Plotted IS values for " + querySeq->getName() + "\n"; + + outString += "\n"; + outString += "\n"; + + outString += "" + toString(info[0].midpoint) + "\n"; + outString += "" + toString(info[info.size()-1].midpoint) + "\n"; + outString += "Base Positions\n"; + + outString += "0\n"; + + outString += "IS\n"; + + + //find max is score + float biggest = 0.0; + for (int i = 0; i < info.size(); i++) { + if (info[i].score > biggest) { + biggest = info[i].score; + } + } + + outString += "" + toString(biggest) + "\n"; + + int scaler2 = 500 / biggest; + + + outString += " "; + for (int i = 0; i < info.size(); i++) { + if(info[i].score < 0) { info[i].score = 0; } + outString += toString(((i*5) + 75)) + "," + toString((600 - (info[i].score * scaler2))) + " "; + } + + outString += "\"/> "; + outString += "\n\n"; + + MPI_Status status; + int length = outString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outString.c_str(), length); + + MPI_File_write(outSVG, buf2, length, MPI_CHAR, &status); + delete buf2; + + MPI_File_close(&outSVG); + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic"); + exit(1); + } +} +#else //*************************************************************************************************************** +void ChimeraCheckRDP::makeSVGpic(vector info) { + try{ + + string file = outputDir + querySeq->getName() + ".chimeracheck.svg"; + ofstream outsvg; + m->openOutputFile(file, outsvg); + + int width = (info.size()*5) + 150; + + outsvg << "\n"; + outsvg << "\n"; + outsvg << "Plotted IS values for " + querySeq->getName() + "\n"; + + outsvg << "\n"; + outsvg << "\n"; + + outsvg << "" + toString(info[0].midpoint) + "\n"; + outsvg << "" + toString(info[info.size()-1].midpoint) + "\n"; + outsvg << "Base Positions\n"; + + outsvg << "0\n"; + + outsvg << "IS\n"; + + + //find max is score + float biggest = 0.0; + for (int i = 0; i < info.size(); i++) { + if (info[i].score > biggest) { + biggest = info[i].score; + } + } + + outsvg << "" + toString(biggest) + "\n"; + + int scaler2 = 500 / biggest; + + + outsvg << " "; + for (int i = 0; i < info.size(); i++) { + if(info[i].score < 0) { info[i].score = 0; } + outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " "; + } + + outsvg << "\"/> "; + outsvg << "\n\n"; + + outsvg.close(); + + } + catch(exception& e) { + m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic"); + exit(1); + } +} +#endif +//***************************************************************************************************************/ +