X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimeraseqscommand.cpp;h=b451721ca8a0d951b8e01ed5b398dd3b94dbe22a;hb=28d65de5f06f5b033109a3b8bbb6d3c4060914d3;hp=182c25e4b909ba2dd49fc1971beb7abee3b99188;hpb=6e63c5ff52bd2830b689417df8ba3db831e63a96;p=mothur.git diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 182c25e..b451721 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -8,6 +8,7 @@ */ #include "chimeraseqscommand.h" +#include "eachgapdist.h" //*************************************************************************************************************** @@ -20,7 +21,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -39,16 +40,24 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; } string temp; - temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } filter = isTrue(temp); temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } correction = isTrue(temp); - temp = validParameter.validFile(parameters, "processors", true); if (temp == "not found") { temp = "1"; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; } + convert(temp, increment); + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; } + + if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; } } } @@ -77,6 +86,11 @@ void ChimeraSeqsCommand::help(){ exit(1); } } +//******************************************************************************************************************** +//sorts highest score to lowest +inline bool comparePref(Preference left, Preference right){ + return (left.score[0] > right.score[0]); +} //*************************************************************************************************************** @@ -89,9 +103,10 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } + //do soft filter if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F"; + string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; filterSeqs = new FilterSeqsCommand(optionString); filterSeqs->execute(); delete filterSeqs; @@ -100,76 +115,169 @@ int ChimeraSeqsCommand::execute(){ fastafile = getRootName(fastafile) + "filter.fasta"; } + distCalculator = new eachGapDist(); + //read in sequences readSeqs(); - //int numSeqs = seqs.size(); + int numSeqs = seqs.size(); - //find average midpoint of seqs - midpoint = findAverageMidPoint(); + if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; } - //create 2 vectors of sequences, 1 for left side and one for right side - vector left; vector right; + //set default window to 25% of sequence length + string seq0 = seqs[0].getAligned(); + if (window == 0) { window = seq0.length() / 4; } + else if (window > (seq0.length() / 2)) { + mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); + window = (seq0.length() / 2); + } - for (int i = 0; i < seqs.size(); i++) { - //save left side - string seqLeft = seqs[i].getAligned(); - seqLeft = seqLeft.substr(0, midpoint); - Sequence tempLeft(seqs[i].getName(), seqLeft); - left.push_back(tempLeft); + if (increment > (seqs[0].getAlignLength() - (2*window))) { + if (increment != 10) { - //save right side - string seqRight = seqs[i].getAligned(); - seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1)); - Sequence tempRight(seqs[i].getName(), seqRight); - right.push_back(tempRight); + mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); + increment = 10; + if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } + + }else{ increment = 0; } } +cout << "increment = " << increment << endl; + if (increment == 0) { iters = 1; } + else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } + + //initialize pref + pref.resize(numSeqs); + + for (int i = 0; i < numSeqs; i++ ) { + pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); + pref[i].name = seqs[i].getName(); + pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; + pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; + } + + int midpoint = window; + int count = 0; + while (count < iters) { + + //create 2 vectors of sequences, 1 for left side and one for right side + vector left; vector right; + + for (int i = 0; i < seqs.size(); i++) { +//cout << "whole = " << seqs[i].getAligned() << endl; + //save left side + string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); + Sequence tempLeft; + tempLeft.setName(seqs[i].getName()); + tempLeft.setAligned(seqLeft); + left.push_back(tempLeft); +//cout << "left = " << tempLeft.getAligned() << endl; + //save right side + string seqRight = seqs[i].getAligned().substr(midpoint, window); + Sequence tempRight; + tempRight.setName(seqs[i].getName()); + tempRight.setAligned(seqRight); + right.push_back(tempRight); +//cout << "right = " << seqRight << endl; + } + + //adjust midpoint by increment + midpoint += increment; + + + //this should be parallelized + //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) + //create a matrix containing the distance from left to left and right to right + //calculate distances + SparseMatrix* SparseLeft = new SparseMatrix(); + SparseMatrix* SparseRight = new SparseMatrix(); + + createSparseMatrix(0, left.size(), SparseLeft, left); + createSparseMatrix(0, right.size(), SparseRight, right); + + vector distMapRight; + vector distMapLeft; + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + distMapRight = vector(numSeqs); + distMapLeft = vector(numSeqs); + //cout << "left" << endl << endl; + for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { + distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + //cout << "right" << endl << endl; + for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { + distMapRight[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + + delete SparseLeft; + delete SparseRight; + - //this should be parallelized - //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) - //create a matrix containing the distance from left to left and right to right - //calculate distances - SparseMatrix* SparseLeft = new SparseMatrix(); - SparseMatrix* SparseRight = new SparseMatrix(); + //fill preference structure + generatePreferences(distMapLeft, distMapRight, midpoint); + + count++; + + } - createSparseMatrix(0, left.size(), SparseLeft, left); - createSparseMatrix(0, right.size(), SparseRight, right); + delete distCalculator; + //find average pref score across windows + //if (increment != 0) { + + //for (int i = 0; i < pref.size(); i++) { + //pref[i].score[0] = pref[i].score[0] / iters; + //} + //} - //vector distMapRight; - //vector distMapLeft; + //sort Preferences highest to lowest + sort(pref.begin(), pref.end(), comparePref); - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - //distMapRight = vector(globaldata->gListVector->size()); - //distMapLeft = vector(globaldata->gListVector->size()); - for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { - //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; - } - for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { - //distMapRight[currentCell->row][currentCell->column] = currentCell->dist; - } - + string outputFileName = getRootName(fastafile) + "chimeras"; + ofstream out; + openOutputFile(outputFileName, out); - //fill preference structure - //generatePreferences(distMapLeft, distMapRight); + int above1 = 0; + out << "Name\tScore\tLeft\tRight\t" << endl; + //output prefenence structure to .chimeras file + for (int i = 0; i < pref.size(); i++) { + out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; + + //calc # of seqs with preference above 1.0 + if (pref[i].score[0] > 1.0) { + above1++; + mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); + mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); + } + + + } - - //output results to screen + //output results to screen mothurOutEndLine(); - mothurOut("\t\t"); mothurOutEndLine(); - //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine(); - //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine(); - //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine(); - //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine(); - //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine(); - //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine(); - //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine(); - //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine(); + mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); + int spot; + spot = pref.size()-1; + mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.975; + mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.75; + mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.50; + mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.25; + mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.025; + mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = 0; + mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - //outSummary.close(); return 0; } catch(exception& e) { @@ -188,6 +296,8 @@ void ChimeraSeqsCommand::readSeqs(){ while(!inFASTA.eof()){ Sequence current(inFASTA); + if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } + seqs.push_back(current); gobble(inFASTA); @@ -201,53 +311,6 @@ void ChimeraSeqsCommand::readSeqs(){ } } - -//*************************************************************************************************************** -int ChimeraSeqsCommand::findAverageMidPoint(){ - try { - int totalMids = 0; - int averageMid = 0; - - //loop through the seqs and find midpoint - for (int i = 0; i < seqs.size(); i++) { - - //get unaligned sequence - seqs[i].setUnaligned(seqs[i].getUnaligned()); //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned - - string unaligned = seqs[i].getUnaligned(); - string aligned = seqs[i].getAligned(); - - //find midpoint of this seq - int count = 0; - int thismid = 0; - for (int j = 0; j < aligned.length(); j++) { - - thismid++; - - //if you are part of the unaligned sequence increment - if (isalpha(aligned[j])) { count++; } - - //if you have reached the halfway point stop - if (count >= (unaligned.length() / 2)) { break; } - } - - //add this mid to total - totalMids += thismid; - - } - - averageMid = (totalMids / seqs.size()); - - return averageMid; - - - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint"); - exit(1); - } -} - /***************************************************************************************************************/ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ try { @@ -256,9 +319,9 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri for(int j=0;jcalcDist(s.get(i), s.get(j)); + distCalculator->calcDist(s[i], s[j]); float dist = distCalculator->getDist(); - + PCell temp(i, j, dist); sparse->addCell(temp); @@ -273,23 +336,122 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri exit(1); } } -/*************************************************************************************************************** -void ChimeraSeqsCommand::generatePreferences(vector left, vector right){ +/***************************************************************************************************************/ +void ChimeraSeqsCommand::generatePreferences(vector left, vector right, int mid){ try { - + + float dme = 0.0; + SeqMap::iterator itR; + SeqMap::iterator itL; + + //initialize pref[i] + for (int i = 0; i < pref.size(); i++) { + pref[i].score[1] = 0.0; + pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[1] = 100000.0; + pref[i].leftParent[1] = ""; + pref[i].rightParent[1] = ""; + } + for (int i = 0; i < left.size(); i++) { - int iscore = 0; - float closestLeft = 100000.0; - float closestRight = 100000.0; + SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, + // 1 to the distance of sequence 3 to sequence 1, + // 2 to the distance of sequence 3 to sequence 2. + SeqMap currentRight = right[i]; // same as left but with distances on the right side. - for (int j = 0; j < left.size(); j++) { + for (int j = 0; j < i; j++) { - //iscore += abs(left - + itL = currentLeft.find(j); + itR = currentRight.find(j); +cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl; +cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl; + + //if you can find this entry update the preferences + if ((itL != currentLeft.end()) && (itR != currentRight.end())) { + + if (!correction) { + pref[i].score[1] += abs((itL->second - itR->second)); + pref[j].score[1] += abs((itL->second - itR->second)); +cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +cout << "abs = " << abs((itL->second - itR->second)) << endl; +cout << i << " score = " << pref[i].score[1] << endl; +cout << j << " score = " << pref[j].score[1] << endl; + }else { + pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); + pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); +cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; +cout << i << " score = " << pref[i].score[1] << endl; +cout << j << " score = " << pref[j].score[1] << endl; + } +cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; + //are you the closest left sequence + if (itL->second < pref[i].closestLeft[1]) { + + pref[i].closestLeft[1] = itL->second; + pref[i].leftParent[1] = seqs[j].getName(); +cout << "updating closest left to " << pref[i].leftParent[1] << endl; + } +cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; + if (itL->second < pref[j].closestLeft[1]) { + pref[j].closestLeft[1] = itL->second; + pref[j].leftParent[1] = seqs[i].getName(); +cout << "updating closest left to " << pref[j].leftParent[1] << endl; + } + + //are you the closest right sequence + if (itR->second < pref[i].closestRight[1]) { + pref[i].closestRight[1] = itR->second; + pref[i].rightParent[1] = seqs[j].getName(); + } + if (itR->second < pref[j].closestRight[1]) { + pref[j].closestRight[1] = itR->second; + pref[j].rightParent[1] = seqs[i].getName(); + } + + } } } + + + + //calculate the dme + int count0 = 0; + for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } + + float expectedPercent = 1 / (float) (pref.size() - count0); +cout << endl << "dme = " << dme << endl; + //recalculate prefernences based on dme + for (int i = 0; i < pref.size(); i++) { +cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; + // gives the actual percentage of the dme this seq adds + pref[i].score[1] = pref[i].score[1] / dme; + + //how much higher or lower is this than expected + pref[i].score[1] = pref[i].score[1] / expectedPercent; + + //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. +cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; + } + + //is this score bigger then the last score + for (int i = 0; i < pref.size(); i++) { + + //update biggest score + if (pref[i].score[1] > pref[i].score[0]) { + pref[i].score[0] = pref[i].score[1]; + pref[i].leftParent[0] = pref[i].leftParent[1]; + pref[i].rightParent[0] = pref[i].rightParent[1]; + pref[i].closestLeft[0] = pref[i].closestLeft[1]; + pref[i].closestRight[0] = pref[i].closestRight[1]; + pref[i].midpoint = mid; + } + + //total of preference scores across windows + //pref[i].score[0] += pref[i].score[1]; + } } catch(exception& e) {