2 * chimeraseqscommand.cpp
5 * Created by Sarah Westcott on 6/29/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "chimeraseqscommand.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
15 //***************************************************************************************************************
17 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
21 //allow user to run help
22 if(option == "help") { help(); abort = true; }
25 //valid paramters for this command
26 string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 OptionParser parser(option);
30 map<string,string> parameters = parser.getParameters();
32 ValidParameters validParameter;
34 //check to make sure all parameters are valid for command
35 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //check for required parameters
40 fastafile = validParameter.validFile(parameters, "fasta", true);
41 if (fastafile == "not open") { abort = true; }
42 else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
45 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; }
46 filter = isTrue(temp);
48 temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
49 correction = isTrue(temp);
51 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
52 convert(temp, processors);
54 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
55 convert(temp, window);
57 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; }
58 convert(temp, increment);
60 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; }
62 if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; }
67 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
71 //**********************************************************************************************************************
73 void ChimeraSeqsCommand::help(){
75 mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
76 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n");
77 mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n");
78 mothurOut("The correction parameter allows you to ..... The default is true. \n");
79 mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
80 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is bellerophon. \n");
81 mothurOut("The chimera.seqs command should be in the following format: \n");
82 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
83 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
84 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
87 errorOut(e, "ChimeraSeqsCommand", "help");
91 //********************************************************************************************************************
92 //sorts highest score to lowest
93 inline bool comparePref(Preference left, Preference right){
94 return (left.score[0] > right.score[0]);
97 //***************************************************************************************************************
99 ChimeraSeqsCommand::~ChimeraSeqsCommand(){ /* do nothing */ }
101 //***************************************************************************************************************
103 int ChimeraSeqsCommand::execute(){
106 if (abort == true) { return 0; }
111 string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
112 filterSeqs = new FilterSeqsCommand(optionString);
113 filterSeqs->execute();
116 //reset fastafile to filtered file
117 fastafile = getRootName(fastafile) + "filter.fasta";
120 distCalculator = new eachGapDist();
125 int numSeqs = seqs.size();
127 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
129 //set default window to 25% of sequence length
130 string seq0 = seqs[0].getAligned();
131 if (window == 0) { window = seq0.length() / 4; }
132 else if (window > (seq0.length() / 2)) {
133 mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
134 window = (seq0.length() / 2);
137 if (increment > (seqs[0].getAlignLength() - (2*window))) {
138 if (increment != 10) {
140 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
142 if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
144 }else{ increment = 0; }
146 cout << "increment = " << increment << endl;
147 if (increment == 0) { iters = 1; }
148 else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
151 pref.resize(numSeqs);
153 for (int i = 0; i < numSeqs; i++ ) {
154 pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
155 pref[i].name = seqs[i].getName();
156 pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
157 pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
158 pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
161 int midpoint = window;
163 while (count < iters) {
165 //create 2 vectors of sequences, 1 for left side and one for right side
166 vector<Sequence> left; vector<Sequence> right;
168 for (int i = 0; i < seqs.size(); i++) {
169 //cout << "whole = " << seqs[i].getAligned() << endl;
171 string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
173 tempLeft.setName(seqs[i].getName());
174 tempLeft.setAligned(seqLeft);
175 left.push_back(tempLeft);
176 //cout << "left = " << tempLeft.getAligned() << endl;
178 string seqRight = seqs[i].getAligned().substr(midpoint, window);
180 tempRight.setName(seqs[i].getName());
181 tempRight.setAligned(seqRight);
182 right.push_back(tempRight);
183 //cout << "right = " << seqRight << endl;
186 //adjust midpoint by increment
187 midpoint += increment;
190 //this should be parallelized
191 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
192 //create a matrix containing the distance from left to left and right to right
193 //calculate distances
194 SparseMatrix* SparseLeft = new SparseMatrix();
195 SparseMatrix* SparseRight = new SparseMatrix();
197 createSparseMatrix(0, left.size(), SparseLeft, left);
198 createSparseMatrix(0, right.size(), SparseRight, right);
200 vector<SeqMap> distMapRight;
201 vector<SeqMap> distMapLeft;
203 // Create a data structure to quickly access the distance information.
204 // It consists of a vector of distance maps, where each map contains
205 // all distances of a certain sequence. Vector and maps are accessed
206 // via the index of a sequence in the distance matrix
207 distMapRight = vector<SeqMap>(numSeqs);
208 distMapLeft = vector<SeqMap>(numSeqs);
209 //cout << "left" << endl << endl;
210 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
211 distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
212 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
214 //cout << "right" << endl << endl;
215 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
216 distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
217 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
224 //fill preference structure
225 generatePreferences(distMapLeft, distMapRight, midpoint);
231 delete distCalculator;
233 //rank preference score to eachother
235 float expectedPercent = 1 / (float) (pref.size());
237 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
239 for (int i = 0; i < pref.size(); i++) {
241 //gives the actual percentage of the dme this seq adds
242 pref[i].score[0] = pref[i].score[0] / dme;
244 //how much higher or lower is this than expected
245 pref[i].score[0] = pref[i].score[0] / expectedPercent;
250 //sort Preferences highest to lowest
251 sort(pref.begin(), pref.end(), comparePref);
253 string outputFileName = getRootName(fastafile) + "chimeras";
255 openOutputFile(outputFileName, out);
258 out << "Name\tScore\tLeft\tRight\t" << endl;
259 //output prefenence structure to .chimeras file
260 for (int i = 0; i < pref.size(); i++) {
261 out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
263 //calc # of seqs with preference above 1.0
264 if (pref[i].score[0] > 1.0) {
266 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
267 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
273 //output results to screen
275 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
277 spot = pref.size()-1;
278 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
279 spot = pref.size() * 0.975;
280 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
281 spot = pref.size() * 0.75;
282 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
283 spot = pref.size() * 0.50;
284 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
285 spot = pref.size() * 0.25;
286 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
287 spot = pref.size() * 0.025;
288 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
290 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
294 catch(exception& e) {
295 errorOut(e, "ChimeraSeqsCommand", "execute");
300 //***************************************************************************************************************
301 void ChimeraSeqsCommand::readSeqs(){
304 openInputFile(fastafile, inFASTA);
306 //read in seqs and store in vector
307 while(!inFASTA.eof()){
308 Sequence current(inFASTA);
310 if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
312 seqs.push_back(current);
319 catch(exception& e) {
320 errorOut(e, "ChimeraSeqsCommand", "readSeqs");
325 /***************************************************************************************************************/
326 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
329 for(int i=startSeq; i<endSeq; i++){
331 for(int j=0;j<i;j++){
333 distCalculator->calcDist(s[i], s[j]);
334 float dist = distCalculator->getDist();
336 PCell temp(i, j, dist);
337 sparse->addCell(temp);
345 catch(exception& e) {
346 errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
350 /***************************************************************************************************************/
351 void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
355 SeqMap::iterator itR;
356 SeqMap::iterator itL;
359 for (int i = 0; i < pref.size(); i++) {
360 pref[i].score[1] = 0.0;
361 pref[i].closestLeft[1] = 100000.0;
362 pref[i].closestRight[1] = 100000.0;
363 pref[i].leftParent[1] = "";
364 pref[i].rightParent[1] = "";
367 for (int i = 0; i < left.size(); i++) {
369 SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
370 // 1 to the distance of sequence 3 to sequence 1,
371 // 2 to the distance of sequence 3 to sequence 2.
372 SeqMap currentRight = right[i]; // same as left but with distances on the right side.
374 for (int j = 0; j < i; j++) {
376 itL = currentLeft.find(j);
377 itR = currentRight.find(j);
378 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
379 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
381 //if you can find this entry update the preferences
382 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
385 pref[i].score[1] += abs((itL->second - itR->second));
386 pref[j].score[1] += abs((itL->second - itR->second));
387 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
388 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
389 //cout << i << " score = " << pref[i].score[1] << endl;
390 //cout << j << " score = " << pref[j].score[1] << endl;
392 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
393 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
394 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
395 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
396 //cout << i << " score = " << pref[i].score[1] << endl;
397 //cout << j << " score = " << pref[j].score[1] << endl;
399 //cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
400 //are you the closest left sequence
401 if (itL->second < pref[i].closestLeft[1]) {
403 pref[i].closestLeft[1] = itL->second;
404 pref[i].leftParent[1] = seqs[j].getName();
405 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
407 //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
408 if (itL->second < pref[j].closestLeft[1]) {
409 pref[j].closestLeft[1] = itL->second;
410 pref[j].leftParent[1] = seqs[i].getName();
411 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
414 //are you the closest right sequence
415 if (itR->second < pref[i].closestRight[1]) {
416 pref[i].closestRight[1] = itR->second;
417 pref[i].rightParent[1] = seqs[j].getName();
419 if (itR->second < pref[j].closestRight[1]) {
420 pref[j].closestRight[1] = itR->second;
421 pref[j].rightParent[1] = seqs[i].getName();
434 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
436 float expectedPercent = 1 / (float) (pref.size() - count0);
437 //cout << endl << "dme = " << dme << endl;
438 //recalculate prefernences based on dme
439 for (int i = 0; i < pref.size(); i++) {
440 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
441 // gives the actual percentage of the dme this seq adds
442 pref[i].score[1] = pref[i].score[1] / dme;
444 //how much higher or lower is this than expected
445 pref[i].score[1] = pref[i].score[1] / expectedPercent;
447 //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
449 //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
450 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
453 //is this score bigger then the last score
454 for (int i = 0; i < pref.size(); i++) {
456 //update biggest score
457 if (pref[i].score[1] > pref[i].score[0]) {
458 pref[i].score[0] = pref[i].score[1];
459 pref[i].leftParent[0] = pref[i].leftParent[1];
460 pref[i].rightParent[0] = pref[i].rightParent[1];
461 pref[i].closestLeft[0] = pref[i].closestLeft[1];
462 pref[i].closestRight[0] = pref[i].closestRight[1];
463 pref[i].midpoint = mid;
469 catch(exception& e) {
470 errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
474 /**************************************************************************************************/
476 /**************************************************************************************************/