2 * chimeraseqscommand.cpp
5 * Created by Sarah Westcott on 6/29/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "chimeraseqscommand.h"
11 #include "eachgapdist.h"
13 //***************************************************************************************************************
15 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter;
32 //check to make sure all parameters are valid for command
33 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //check for required parameters
38 fastafile = validParameter.validFile(parameters, "fasta", true);
39 if (fastafile == "not open") { abort = true; }
40 else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
43 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; }
44 filter = isTrue(temp);
46 temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
47 correction = isTrue(temp);
49 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
50 convert(temp, processors);
52 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
53 convert(temp, window);
55 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; }
56 convert(temp, increment);
58 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; }
60 if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; }
65 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
69 //**********************************************************************************************************************
71 void ChimeraSeqsCommand::help(){
73 mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
74 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n");
75 mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n");
76 mothurOut("The correction parameter allows you to ..... The default is true. \n");
77 mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
78 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is bellerophon. \n");
79 mothurOut("The chimera.seqs command should be in the following format: \n");
80 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
81 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
82 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
85 errorOut(e, "ChimeraSeqsCommand", "help");
89 //********************************************************************************************************************
90 //sorts highest score to lowest
91 inline bool comparePref(Preference left, Preference right){
92 return (left.score[0] > right.score[0]);
95 //***************************************************************************************************************
97 ChimeraSeqsCommand::~ChimeraSeqsCommand(){ /* do nothing */ }
99 //***************************************************************************************************************
101 int ChimeraSeqsCommand::execute(){
104 if (abort == true) { return 0; }
109 string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
110 filterSeqs = new FilterSeqsCommand(optionString);
111 filterSeqs->execute();
114 //reset fastafile to filtered file
115 fastafile = getRootName(fastafile) + "filter.fasta";
118 distCalculator = new eachGapDist();
123 int numSeqs = seqs.size();
125 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
127 //set default window to 25% of sequence length
128 string seq0 = seqs[0].getAligned();
129 if (window == 0) { window = seq0.length() / 4; }
130 else if (window > (seq0.length() / 2)) {
131 mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
132 window = (seq0.length() / 2);
135 if (increment > (seqs[0].getAlignLength() - (2*window))) {
136 if (increment != 10) {
138 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
140 if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
142 }else{ increment = 0; }
144 cout << "increment = " << increment << endl;
145 if (increment == 0) { iters = 1; }
146 else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
149 pref.resize(numSeqs);
151 for (int i = 0; i < numSeqs; i++ ) {
152 pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
153 pref[i].name = seqs[i].getName();
154 pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
155 pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
156 pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
159 int midpoint = window;
161 while (count < iters) {
163 //create 2 vectors of sequences, 1 for left side and one for right side
164 vector<Sequence> left; vector<Sequence> right;
166 for (int i = 0; i < seqs.size(); i++) {
167 //cout << "whole = " << seqs[i].getAligned() << endl;
169 string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
171 tempLeft.setName(seqs[i].getName());
172 tempLeft.setAligned(seqLeft);
173 left.push_back(tempLeft);
174 //cout << "left = " << tempLeft.getAligned() << endl;
176 string seqRight = seqs[i].getAligned().substr(midpoint, window);
178 tempRight.setName(seqs[i].getName());
179 tempRight.setAligned(seqRight);
180 right.push_back(tempRight);
181 //cout << "right = " << seqRight << endl;
184 //adjust midpoint by increment
185 midpoint += increment;
188 //this should be parallelized
189 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
190 //create a matrix containing the distance from left to left and right to right
191 //calculate distances
192 SparseMatrix* SparseLeft = new SparseMatrix();
193 SparseMatrix* SparseRight = new SparseMatrix();
195 createSparseMatrix(0, left.size(), SparseLeft, left);
196 createSparseMatrix(0, right.size(), SparseRight, right);
198 vector<SeqMap> distMapRight;
199 vector<SeqMap> distMapLeft;
201 // Create a data structure to quickly access the distance information.
202 // It consists of a vector of distance maps, where each map contains
203 // all distances of a certain sequence. Vector and maps are accessed
204 // via the index of a sequence in the distance matrix
205 distMapRight = vector<SeqMap>(numSeqs);
206 distMapLeft = vector<SeqMap>(numSeqs);
207 //cout << "left" << endl << endl;
208 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
209 distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
210 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
212 //cout << "right" << endl << endl;
213 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
214 distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
215 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
222 //fill preference structure
223 generatePreferences(distMapLeft, distMapRight, midpoint);
229 delete distCalculator;
231 //find average pref score across windows
232 //if (increment != 0) {
234 //for (int i = 0; i < pref.size(); i++) {
235 //pref[i].score[0] = pref[i].score[0] / iters;
239 //sort Preferences highest to lowest
240 sort(pref.begin(), pref.end(), comparePref);
242 string outputFileName = getRootName(fastafile) + "chimeras";
244 openOutputFile(outputFileName, out);
247 out << "Name\tScore\tLeft\tRight\t" << endl;
248 //output prefenence structure to .chimeras file
249 for (int i = 0; i < pref.size(); i++) {
250 out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
252 //calc # of seqs with preference above 1.0
253 if (pref[i].score[0] > 1.0) {
255 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
256 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
262 //output results to screen
264 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
266 spot = pref.size()-1;
267 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
268 spot = pref.size() * 0.975;
269 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
270 spot = pref.size() * 0.75;
271 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
272 spot = pref.size() * 0.50;
273 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
274 spot = pref.size() * 0.25;
275 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
276 spot = pref.size() * 0.025;
277 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
279 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
283 catch(exception& e) {
284 errorOut(e, "ChimeraSeqsCommand", "execute");
289 //***************************************************************************************************************
290 void ChimeraSeqsCommand::readSeqs(){
293 openInputFile(fastafile, inFASTA);
295 //read in seqs and store in vector
296 while(!inFASTA.eof()){
297 Sequence current(inFASTA);
299 if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
301 seqs.push_back(current);
308 catch(exception& e) {
309 errorOut(e, "ChimeraSeqsCommand", "readSeqs");
314 /***************************************************************************************************************/
315 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
318 for(int i=startSeq; i<endSeq; i++){
320 for(int j=0;j<i;j++){
322 distCalculator->calcDist(s[i], s[j]);
323 float dist = distCalculator->getDist();
325 PCell temp(i, j, dist);
326 sparse->addCell(temp);
334 catch(exception& e) {
335 errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
339 /***************************************************************************************************************/
340 void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
344 SeqMap::iterator itR;
345 SeqMap::iterator itL;
348 for (int i = 0; i < pref.size(); i++) {
349 pref[i].score[1] = 0.0;
350 pref[i].closestLeft[1] = 100000.0;
351 pref[i].closestRight[1] = 100000.0;
352 pref[i].leftParent[1] = "";
353 pref[i].rightParent[1] = "";
356 for (int i = 0; i < left.size(); i++) {
358 SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
359 // 1 to the distance of sequence 3 to sequence 1,
360 // 2 to the distance of sequence 3 to sequence 2.
361 SeqMap currentRight = right[i]; // same as left but with distances on the right side.
363 for (int j = 0; j < i; j++) {
365 itL = currentLeft.find(j);
366 itR = currentRight.find(j);
367 cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
368 cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
370 //if you can find this entry update the preferences
371 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
374 pref[i].score[1] += abs((itL->second - itR->second));
375 pref[j].score[1] += abs((itL->second - itR->second));
376 cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
377 cout << "abs = " << abs((itL->second - itR->second)) << endl;
378 cout << i << " score = " << pref[i].score[1] << endl;
379 cout << j << " score = " << pref[j].score[1] << endl;
381 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
382 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
383 cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
384 cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
385 cout << i << " score = " << pref[i].score[1] << endl;
386 cout << j << " score = " << pref[j].score[1] << endl;
388 cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
389 //are you the closest left sequence
390 if (itL->second < pref[i].closestLeft[1]) {
392 pref[i].closestLeft[1] = itL->second;
393 pref[i].leftParent[1] = seqs[j].getName();
394 cout << "updating closest left to " << pref[i].leftParent[1] << endl;
396 cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
397 if (itL->second < pref[j].closestLeft[1]) {
398 pref[j].closestLeft[1] = itL->second;
399 pref[j].leftParent[1] = seqs[i].getName();
400 cout << "updating closest left to " << pref[j].leftParent[1] << endl;
403 //are you the closest right sequence
404 if (itR->second < pref[i].closestRight[1]) {
405 pref[i].closestRight[1] = itR->second;
406 pref[i].rightParent[1] = seqs[j].getName();
408 if (itR->second < pref[j].closestRight[1]) {
409 pref[j].closestRight[1] = itR->second;
410 pref[j].rightParent[1] = seqs[i].getName();
422 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
424 float expectedPercent = 1 / (float) (pref.size() - count0);
425 cout << endl << "dme = " << dme << endl;
426 //recalculate prefernences based on dme
427 for (int i = 0; i < pref.size(); i++) {
428 cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
429 // gives the actual percentage of the dme this seq adds
430 pref[i].score[1] = pref[i].score[1] / dme;
432 //how much higher or lower is this than expected
433 pref[i].score[1] = pref[i].score[1] / expectedPercent;
435 //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
436 cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
439 //is this score bigger then the last score
440 for (int i = 0; i < pref.size(); i++) {
442 //update biggest score
443 if (pref[i].score[1] > pref[i].score[0]) {
444 pref[i].score[0] = pref[i].score[1];
445 pref[i].leftParent[0] = pref[i].leftParent[1];
446 pref[i].rightParent[0] = pref[i].rightParent[1];
447 pref[i].closestLeft[0] = pref[i].closestLeft[1];
448 pref[i].closestRight[0] = pref[i].closestRight[1];
449 pref[i].midpoint = mid;
452 //total of preference scores across windows
453 //pref[i].score[0] += pref[i].score[1];
457 catch(exception& e) {
458 errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
462 /**************************************************************************************************/
464 /**************************************************************************************************/