2 * alignedsimilarity.cpp
5 * Created by westcott on 8/18/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "alignedsimilarity.h"
11 #include "ignoregaps.h"
14 //***************************************************************************************************************
15 AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; }
16 //***************************************************************************************************************
18 AlignSim::~AlignSim() {
20 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
21 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
25 errorOut(e, "AlignSim", "~AlignSim");
29 //***************************************************************************************************************
30 void AlignSim::print(ostream& out) {
35 for (int i = 0; i < querySeqs.size(); i++) {
37 int j = 0; float largest = -10;
38 //find largest sim value
39 for (int k = 0; k < IS[i].size(); k++) {
40 //is this score larger
41 if (IS[i][k].score > largest) {
43 largest = IS[i][k].score;
47 //find parental similarity
48 distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
49 float dist = distCalc->getDist();
51 //convert to similarity
52 dist = (1 - dist) * 100;
54 //warn about parental similarity - if its above 82% may not detect a chimera
55 if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
57 int index = ceil(dist);
59 if (index == 0) { index=1; }
61 //is your DE value higher than the 95%
63 if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
64 else { chimera = "No"; }
66 out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
68 if (chimera == "Yes") {
69 mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
71 out << "Improvement Score\t";
73 for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
78 errorOut(e, "AlignSim", "print");
83 //***************************************************************************************************************
84 void AlignSim::getChimeras() {
87 //read in query sequences and subject sequences
88 mothurOut("Reading sequences and template file... "); cout.flush();
89 querySeqs = readSeqs(fastafile);
90 templateSeqs = readSeqs(templateFile);
91 mothurOut("Done."); mothurOutEndLine();
93 int numSeqs = querySeqs.size();
97 //break up file if needed
98 int linesPerProcess = numSeqs / processors ;
100 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
101 //find breakup of sequences for all times we will Parallelize
102 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
105 for (int i = 0; i < (processors-1); i++) {
106 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
108 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
109 int i = processors - 1;
110 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
113 //find breakup of templatefile for quantiles
114 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
116 for (int i = 0; i < processors; i++) {
117 templateLines.push_back(new linePair());
118 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
119 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
123 lines.push_back(new linePair(0, numSeqs));
124 templateLines.push_back(new linePair(0, templateSeqs.size()));
128 distCalc = new ignoreGaps();
131 windowBreak = findWindows();
132 //for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; }
135 mothurOut("Finding the IS values for your sequences..."); cout.flush();
136 if (processors == 1) {
137 IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
138 }else { IS = createProcessesIS(querySeqs, lines); }
139 mothurOut("Done."); mothurOutEndLine();
141 //quantiles are used to determine whether the de values found indicate a chimera
142 if (quanfile != "") { quantile = readQuantiles(); }
145 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
146 //get IS quantiles as a reference to these IS scores
147 //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
148 if (processors == 1) {
149 templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
150 }else { templateIS = createProcessesIS(templateSeqs, templateLines); }
151 //cout << "here" << endl;
155 o = getRootName(templateFile) + "alignedsimilarity.quan";
157 openOutputFile(o, out4);
161 quantile.resize(100);
163 for (int i = 0; i < templateIS.size(); i++) {
165 if (templateIS[i].size() != 0) {
166 int j = 0; float score = -1000;
167 //find highest IS score
168 //cout << templateIS[i].size() << endl;
169 for (int k = 0; k < templateIS[i].size(); k++) {
170 if (templateIS[i][k].score > score) {
171 score = templateIS[i][k].score;
176 //find similarity of parents
177 distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
178 float dist = distCalc->getDist();
180 //convert to similarity
181 dist = (1 - dist) * 100;
182 // cout << dist << endl;
183 int index = ceil(dist);
184 // cout << "index = " << index << endl;
185 if (index == 0) { index = 1; }
186 quantile[index-1].push_back(templateIS[i][j].score);
191 for (int i = 0; i < quantile.size(); i++) {
194 if (quantile[i].size() == 0) {
195 //in case this is not a distance found in your template files
196 for (int g = 0; g < 6; g++) {
201 sort(quantile[i].begin(), quantile[i].end());
204 temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
206 temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
208 temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
210 temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
212 temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
214 temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
220 for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
229 mothurOut("Done."); mothurOutEndLine();
234 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
235 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
238 catch(exception& e) {
239 errorOut(e, "AlignSim", "getChimeras");
243 //***************************************************************************************************************
244 vector<int> AlignSim::findWindows() {
249 if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
251 for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
256 catch(exception& e) {
257 errorOut(e, "AlignSim", "findWindows");
262 //***************************************************************************************************************
263 vector< vector<sim> > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
266 vector< vector<sim> > isValues;
267 isValues.resize(seqs.size());
270 for(int i = start; i < end; i++){
272 vector<sim> temp; temp.resize(windowBreak.size());
274 mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
277 for (int m = 0; m < windowBreak.size(); m++) {
279 vector<Sequence*> closest; //left, right, overall
280 vector<int> similarity; //left+right and overall
282 //find closest left, closest right and closest overall
283 closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
285 int totalNumBases = seqs[i]->getUnaligned().length();
287 //IS = left+right-overall
288 float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
290 ///save IS, leftparent, rightparent, breakpoint
291 temp[m].leftParent = closest[0];
292 temp[m].rightParent = closest[1];
294 temp[m].midpoint = windowBreak[m];
295 //cout << is << '\t';
305 catch(exception& e) {
306 errorOut(e, "AlignSim", "findIS");
310 //***************************************************************************************************************
311 vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
314 vector<Sequence*> closest;
316 Sequence query, queryLeft, queryRight;
317 string frag = seq->getAligned();
318 string fragLeft = frag.substr(0, breakpoint);
319 string fragRight = frag.substr(breakpoint, frag.length());
325 queryLeft.setAligned(fragLeft);
326 queryRight.setAligned(fragRight);
329 Sequence* overall = templateSeqs[0];
330 Sequence* left = templateSeqs[0];
331 Sequence* right = templateSeqs[0];
333 float smallestOverall, smallestLeft, smallestRight;
334 smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
336 //go through the templateSeqs and search for the closest
337 for(int j = 0; j < templateSeqs.size(); j++){
339 //so you don't pick yourself
341 Sequence temp, tempLeft, tempRight;
342 string fragTemp = templateSeqs[j]->getAligned();
343 string fragTempLeft = fragTemp.substr(0, breakpoint);
344 string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
347 temp = *(templateSeqs[j]);
348 tempLeft = *(templateSeqs[j]);
349 tempRight = *(templateSeqs[j]);
350 tempLeft.setAligned(fragTempLeft);
351 tempRight.setAligned(fragTempRight);
354 distCalc->calcDist(query, temp);
355 float dist = distCalc->getDist();
357 if (dist < smallestOverall) {
358 overall = templateSeqs[j];
359 smallestOverall = dist;
363 distCalc->calcDist(queryLeft, tempLeft);
364 dist = distCalc->getDist();
366 if (dist < smallestLeft) {
367 left = templateSeqs[j];
372 distCalc->calcDist(queryRight, tempRight);
373 dist = distCalc->getDist();
375 if (dist < smallestRight) {
376 right = templateSeqs[j];
377 smallestRight = dist;
383 closest.push_back(left);
384 closest.push_back(right);
385 closest.push_back(overall);
387 //fill sim with number of matched bases
388 Sequence tempL = *(left);
389 string tempLFrag = tempL.getAligned();
390 tempL.setAligned(tempLFrag.substr(0, breakpoint));
392 int bothMatches = findNumMatchedBases(queryLeft, tempL);
393 sim.push_back(bothMatches);
395 Sequence tempR = *(right);
396 string tempRFrag = tempR.getAligned();
397 tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
399 bothMatches = findNumMatchedBases(queryRight, tempR);
400 sim.push_back(bothMatches);
402 bothMatches = findNumMatchedBases(query, *(overall));
403 sim.push_back(bothMatches);
409 catch(exception& e) {
410 errorOut(e, "AlignSim", "findClosestSides");
414 //***************************************************************************************************************
416 int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
421 string query = seq.getAligned();
422 string subject = temp.getAligned();
423 //cout << seq.getName() << endl << endl;
424 //cout << temp.getName() << endl << endl;
426 for (int i = 0; i < query.length(); i++) {
427 //count matches if query[i] is a base and subject is same bases or gap
428 if (isalpha(query[i])) {
429 if(!isalpha(subject[i])) { num++; }
430 else if (query[i] == subject[i]) { num++; }
434 //cout << "num = " << num << endl;
437 catch(exception& e) {
438 errorOut(e, "AlignSim", "findNumMatchedBases");
443 /**************************************************************************************************/
444 vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
446 vector< vector<sim> > localIs; localIs.resize(seqs.size());
448 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
450 vector<int> processIDS;
452 //loop through and create all the processes you want
453 while (process != processors) {
457 processIDS.push_back(pid);
461 mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
462 localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
463 mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
465 //write out data to file so parent can read it
467 string s = toString(getpid()) + ".temp";
468 openOutputFile(s, out);
471 for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
472 out << localIs[i].size() << endl;
473 for (int j = 0; j < localIs[i].size(); j++) {
474 localIs[i][j].leftParent->printSequence(out);
475 localIs[i][j].rightParent->printSequence(out);
476 out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
482 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
485 //force parent to wait until all the processes are done
486 for (int i=0;i<processors;i++) {
487 int temp = processIDS[i];
491 //get data created by processes
492 for (int i=0;i<processors;i++) {
494 string s = toString(processIDS[i]) + ".temp";
495 openInputFile(s, in);
498 for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
503 vector<sim> tempVector;
505 for (int j = 0; j < size; j++) {
509 temp.leftParent = new Sequence(in);
511 //temp.leftParent->printSequence(cout);
512 temp.rightParent = new Sequence(in);
514 //temp.rightParent->printSequence(cout);
516 in >> throwaway >> temp.score >> temp.midpoint;
517 //cout << temp.score << '\t' << temp.midpoint << endl;
520 tempVector.push_back(temp);
523 localIs[k] = tempVector;
532 localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
537 catch(exception& e) {
538 errorOut(e, "AlignSim", "createProcessesIS");
543 //***************************************************************************************************************