X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimeraseqscommand.cpp;h=aaed3bab7b56b14460d06de90c9551df70c4351d;hb=dbd5da8043df1cb9f5ff7c6ddb5f550ea49b52c2;hp=b451721ca8a0d951b8e01ed5b398dd3b94dbe22a;hpb=28d65de5f06f5b033109a3b8bbb6d3c4060914d3;p=mothur.git diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index b451721..aaed3ba 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -8,7 +8,8 @@ */ #include "chimeraseqscommand.h" -#include "eachgapdist.h" +#include "bellerophon.h" +#include "pintail.h" //*************************************************************************************************************** @@ -21,7 +22,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -39,6 +40,19 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ if (fastafile == "not open") { abort = true; } else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; } + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; } + + consfile = validParameter.validFile(parameters, "conservation", true); + if (consfile == "not open") { abort = true; } + else if (consfile == "not found") { consfile = ""; } + + quanfile = validParameter.validFile(parameters, "quantile", true); + if (quanfile == "not open") { abort = true; } + else if (quanfile == "not found") { quanfile = ""; } + + string temp; temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } filter = isTrue(temp); @@ -52,12 +66,13 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } convert(temp, window); - temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; } + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } convert(temp, increment); - method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } + + if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true; } - if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; } } } @@ -73,9 +88,9 @@ void ChimeraSeqsCommand::help(){ mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n"); mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n"); - mothurOut("The correction parameter allows you to ..... The default is true. \n"); + mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); - mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is bellerophon. \n"); + mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n"); mothurOut("The chimera.seqs command should be in the following format: \n"); mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n"); @@ -86,11 +101,6 @@ void ChimeraSeqsCommand::help(){ exit(1); } } -//******************************************************************************************************************** -//sorts highest score to lowest -inline bool comparePref(Preference left, Preference right){ - return (left.score[0] > right.score[0]); -} //*************************************************************************************************************** @@ -103,363 +113,45 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } - - //do soft filter - if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; - filterSeqs = new FilterSeqsCommand(optionString); - filterSeqs->execute(); - delete filterSeqs; + if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } + else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); + //saves time to avoid generating it + if (consfile != "") { chimera->setCons(consfile); } + else { chimera->setCons(""); } - //reset fastafile to filtered file - fastafile = getRootName(fastafile) + "filter.fasta"; - } - - distCalculator = new eachGapDist(); - - //read in sequences - readSeqs(); - - int numSeqs = seqs.size(); - - if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; } - - //set default window to 25% of sequence length - string seq0 = seqs[0].getAligned(); - if (window == 0) { window = seq0.length() / 4; } - else if (window > (seq0.length() / 2)) { - mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); - window = (seq0.length() / 2); - } + //saves time to avoid generating it + if (quanfile != "") { chimera->setQuantiles(quanfile); } + else { chimera->setQuantiles(""); } + }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } - if (increment > (seqs[0].getAlignLength() - (2*window))) { - if (increment != 10) { - - mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); - increment = 10; - if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } - - }else{ increment = 0; } - } -cout << "increment = " << increment << endl; - if (increment == 0) { iters = 1; } - else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } - - //initialize pref - pref.resize(numSeqs); - - for (int i = 0; i < numSeqs; i++ ) { - pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); - pref[i].name = seqs[i].getName(); - pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; - pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; - pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; - } - - int midpoint = window; - int count = 0; - while (count < iters) { - - //create 2 vectors of sequences, 1 for left side and one for right side - vector left; vector right; - - for (int i = 0; i < seqs.size(); i++) { -//cout << "whole = " << seqs[i].getAligned() << endl; - //save left side - string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); - Sequence tempLeft; - tempLeft.setName(seqs[i].getName()); - tempLeft.setAligned(seqLeft); - left.push_back(tempLeft); -//cout << "left = " << tempLeft.getAligned() << endl; - //save right side - string seqRight = seqs[i].getAligned().substr(midpoint, window); - Sequence tempRight; - tempRight.setName(seqs[i].getName()); - tempRight.setAligned(seqRight); - right.push_back(tempRight); -//cout << "right = " << seqRight << endl; - } - - //adjust midpoint by increment - midpoint += increment; - - - //this should be parallelized - //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) - //create a matrix containing the distance from left to left and right to right - //calculate distances - SparseMatrix* SparseLeft = new SparseMatrix(); - SparseMatrix* SparseRight = new SparseMatrix(); - - createSparseMatrix(0, left.size(), SparseLeft, left); - createSparseMatrix(0, right.size(), SparseRight, right); + //set user options + chimera->setFilter(filter); + chimera->setCorrection(correction); + chimera->setProcessors(processors); + chimera->setWindow(window); + chimera->setIncrement(increment); - vector distMapRight; - vector distMapLeft; - - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - distMapRight = vector(numSeqs); - distMapLeft = vector(numSeqs); - //cout << "left" << endl << endl; - for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { - distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; - } - //cout << "right" << endl << endl; - for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { - distMapRight[currentCell->row][currentCell->column] = currentCell->dist; - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; - } - - delete SparseLeft; - delete SparseRight; - - - //fill preference structure - generatePreferences(distMapLeft, distMapRight, midpoint); - - count++; - - } + //find chimeras + chimera->getChimeras(); - delete distCalculator; - - //find average pref score across windows - //if (increment != 0) { - - //for (int i = 0; i < pref.size(); i++) { - //pref[i].score[0] = pref[i].score[0] / iters; - //} - //} - - //sort Preferences highest to lowest - sort(pref.begin(), pref.end(), comparePref); - - string outputFileName = getRootName(fastafile) + "chimeras"; + string outputFileName = getRootName(fastafile) + method + ".chimeras"; ofstream out; openOutputFile(outputFileName, out); - int above1 = 0; - out << "Name\tScore\tLeft\tRight\t" << endl; - //output prefenence structure to .chimeras file - for (int i = 0; i < pref.size(); i++) { - out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; - - //calc # of seqs with preference above 1.0 - if (pref[i].score[0] > 1.0) { - above1++; - mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); - mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); - } - - - } + //print results + chimera->print(out); - //output results to screen - mothurOutEndLine(); - mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); - int spot; - spot = pref.size()-1; - mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.975; - mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.75; - mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.50; - mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.25; - mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.025; - mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = 0; - mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + out.close(); - return 0; - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "execute"); - exit(1); - } -} - -//*************************************************************************************************************** -void ChimeraSeqsCommand::readSeqs(){ - try { - ifstream inFASTA; - openInputFile(fastafile, inFASTA); + delete chimera; - //read in seqs and store in vector - while(!inFASTA.eof()){ - Sequence current(inFASTA); - - if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } - - seqs.push_back(current); - - gobble(inFASTA); - } - inFASTA.close(); - - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "readSeqs"); - exit(1); - } -} - -/***************************************************************************************************************/ -int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ - try { - - for(int i=startSeq; icalcDist(s[i], s[j]); - float dist = distCalculator->getDist(); - - PCell temp(i, j, dist); - sparse->addCell(temp); - - } - } - - - return 1; - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix"); - exit(1); - } -} -/***************************************************************************************************************/ -void ChimeraSeqsCommand::generatePreferences(vector left, vector right, int mid){ - try { - - float dme = 0.0; - SeqMap::iterator itR; - SeqMap::iterator itL; - - //initialize pref[i] - for (int i = 0; i < pref.size(); i++) { - pref[i].score[1] = 0.0; - pref[i].closestLeft[1] = 100000.0; - pref[i].closestRight[1] = 100000.0; - pref[i].leftParent[1] = ""; - pref[i].rightParent[1] = ""; - } - - for (int i = 0; i < left.size(); i++) { - - SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, - // 1 to the distance of sequence 3 to sequence 1, - // 2 to the distance of sequence 3 to sequence 2. - SeqMap currentRight = right[i]; // same as left but with distances on the right side. - - for (int j = 0; j < i; j++) { - - itL = currentLeft.find(j); - itR = currentRight.find(j); -cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl; -cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl; - - //if you can find this entry update the preferences - if ((itL != currentLeft.end()) && (itR != currentRight.end())) { - - if (!correction) { - pref[i].score[1] += abs((itL->second - itR->second)); - pref[j].score[1] += abs((itL->second - itR->second)); -cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; -cout << "abs = " << abs((itL->second - itR->second)) << endl; -cout << i << " score = " << pref[i].score[1] << endl; -cout << j << " score = " << pref[j].score[1] << endl; - }else { - pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); - pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); -cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; -cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; -cout << i << " score = " << pref[i].score[1] << endl; -cout << j << " score = " << pref[j].score[1] << endl; - } -cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; - //are you the closest left sequence - if (itL->second < pref[i].closestLeft[1]) { - - pref[i].closestLeft[1] = itL->second; - pref[i].leftParent[1] = seqs[j].getName(); -cout << "updating closest left to " << pref[i].leftParent[1] << endl; - } -cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; - if (itL->second < pref[j].closestLeft[1]) { - pref[j].closestLeft[1] = itL->second; - pref[j].leftParent[1] = seqs[i].getName(); -cout << "updating closest left to " << pref[j].leftParent[1] << endl; - } - - //are you the closest right sequence - if (itR->second < pref[i].closestRight[1]) { - pref[i].closestRight[1] = itR->second; - pref[i].rightParent[1] = seqs[j].getName(); - } - if (itR->second < pref[j].closestRight[1]) { - pref[j].closestRight[1] = itR->second; - pref[j].rightParent[1] = seqs[i].getName(); - } - - } - } - - } - - - - //calculate the dme - int count0 = 0; - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } - - float expectedPercent = 1 / (float) (pref.size() - count0); -cout << endl << "dme = " << dme << endl; - //recalculate prefernences based on dme - for (int i = 0; i < pref.size(); i++) { -cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; - // gives the actual percentage of the dme this seq adds - pref[i].score[1] = pref[i].score[1] / dme; - - //how much higher or lower is this than expected - pref[i].score[1] = pref[i].score[1] / expectedPercent; - - //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. -cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; - } + return 0; - //is this score bigger then the last score - for (int i = 0; i < pref.size(); i++) { - - //update biggest score - if (pref[i].score[1] > pref[i].score[0]) { - pref[i].score[0] = pref[i].score[1]; - pref[i].leftParent[0] = pref[i].leftParent[1]; - pref[i].rightParent[0] = pref[i].rightParent[1]; - pref[i].closestLeft[0] = pref[i].closestLeft[1]; - pref[i].closestRight[0] = pref[i].closestRight[1]; - pref[i].midpoint = mid; - } - - //total of preference scores across windows - //pref[i].score[0] += pref[i].score[1]; - } - } catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "generatePreferences"); + errorOut(e, "ChimeraSeqsCommand", "execute"); exit(1); } } /**************************************************************************************************/ -/**************************************************************************************************/ -