5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
21 Pintail::Pintail(string filename, string o) {
22 fastafile = filename; outputDir = o;
23 distcalculator = new eachGapDist();
24 decalc = new DeCalculator();
26 //***************************************************************************************************************
31 delete distcalculator;
35 m->errorOut(e, "Pintail", "~Pintail");
39 //***************************************************************************************************************
40 int Pintail::doPrep() {
43 mergedFilterString = "";
44 windowSizesTemplate.resize(templateSeqs.size(), window);
45 quantiles.resize(100); //one for every percent mismatch
46 quantilesMembers.resize(100); //one for every percent mismatch
48 //if the user does not enter a mask then you want to keep all the spots in the alignment
49 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
50 else { decalc->setAlignmentLength(seqMask.length()); }
52 decalc->setMask(seqMask);
54 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
55 //find breakup of templatefile for quantiles
56 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
58 for (int i = 0; i < processors; i++) {
59 templateLines.push_back(new linePair());
60 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
61 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
65 templateLines.push_back(new linePair(0, templateSeqs.size()));
69 m->mothurOut("Getting conservation... "); cout.flush();
71 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
72 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
73 if (m->control_pressed) { return 0; }
74 m->mothurOut("Done."); m->mothurOutEndLine();
75 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
76 m->mothurOutEndLine();
79 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
82 //create filter if needed for later
85 //read in all query seqs
87 openInputFile(fastafile, in);
89 vector<Sequence*> tempQuerySeqs;
91 if (m->control_pressed) {
92 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
96 Sequence* s = new Sequence(in);
99 if (s->getName() != "") { tempQuerySeqs.push_back(s); }
103 vector<Sequence*> temp;
104 //merge query seqs and template seqs
106 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
111 for (int i = 0; i < temp.size(); i++) {
112 if (m->control_pressed) {
113 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
116 decalc->runMask(temp[i]);
120 mergedFilterString = createFilter(temp, 0.5);
122 if (m->control_pressed) {
123 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
127 //reread template seqs
128 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
132 //quantiles are used to determine whether the de values found indicate a chimera
133 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
134 //combination of sequences in the template
135 if (quanfile != "") {
136 quantiles = readQuantiles();
138 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
141 for (int i = 0; i < templateSeqs.size(); i++) {
142 if (m->control_pressed) { return 0; }
143 decalc->runMask(templateSeqs[i]);
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 runFilter(templateSeqs[i]);
155 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
156 if (processors == 1) {
157 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
158 }else { createProcessesQuan(); }
160 if (m->control_pressed) { return 0; }
163 string noOutliers, outliers;
165 if ((!filter) && (seqMask == "")) {
166 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
167 }else if ((!filter) && (seqMask != "")) {
168 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
169 }else if ((filter) && (seqMask != "")) {
170 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
171 }else if ((filter) && (seqMask == "")) {
172 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
175 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
177 if (m->control_pressed) { return 0; }
179 openOutputFile(noOutliers, out5);
181 for (int i = 0; i < quantilesMembers.size(); i++) {
184 if (quantilesMembers[i].size() == 0) {
185 //in case this is not a distance found in your template files
186 for (int g = 0; g < 6; g++) {
191 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
194 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
196 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
198 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
200 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
210 for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; }
217 m->mothurOut("Done."); m->mothurOutEndLine();
221 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
222 templateSeqs.clear();
223 templateSeqs = readSeqs(templateFileName);
228 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
233 catch(exception& e) {
234 m->errorOut(e, "Pintail", "doPrep");
238 //***************************************************************************************************************
239 int Pintail::print(ostream& out, ostream& outAcc) {
241 int index = ceil(deviation);
243 //is your DE value higher than the 95%
245 if (index != 0) { //if index is 0 then its an exact match to a template seq
246 if (quantiles[index][4] == 0.0) {
247 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
249 if (DE > quantiles[index][4]) { chimera = "Yes"; }
250 else { chimera = "No"; }
252 }else{ chimera = "No"; }
254 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
255 if (chimera == "Yes") {
256 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
257 outAcc << querySeq->getName() << endl;
261 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
266 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
272 catch(exception& e) {
273 m->errorOut(e, "Pintail", "print");
278 //***************************************************************************************************************
279 int Pintail::getChimeras(Sequence* query) {
283 windowSizes = window;
285 //find pairs has to be done before a mask
286 bestfit = findPairs(query);
288 if (m->control_pressed) { return 0; }
292 decalc->runMask(query);
293 decalc->runMask(bestfit);
296 if (filter) { //must be done after a mask
303 decalc->trimSeqs(query, bestfit, trimmed);
306 it = trimmed.begin();
307 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
309 //find observed distance
310 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
312 if (m->control_pressed) { return 0; }
314 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
316 if (m->control_pressed) { return 0; }
319 seqCoef = decalc->getCoef(obsDistance, Qav);
321 //calculating expected distance
322 expectedDistance = decalc->calcExpected(Qav, seqCoef);
324 if (m->control_pressed) { return 0; }
327 DE = decalc->calcDE(obsDistance, expectedDistance);
329 if (m->control_pressed) { return 0; }
331 //find distance between query and closest match
332 it = trimmed.begin();
333 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
339 catch(exception& e) {
340 m->errorOut(e, "Pintail", "getChimeras");
345 //***************************************************************************************************************
347 vector<float> Pintail::readFreq() {
351 openInputFile(consfile, in);
354 set<int> h = decalc->getPos(); //positions of bases in masking sequence
356 //read in probabilities and store in vector
363 if (h.count(pos) > 0) {
365 Pi = (num - 0.25) / 0.75;
367 //cannot have probability less than 0.
368 if (Pi < 0) { Pi = 0.0; }
370 //do you want this spot
381 catch(exception& e) {
382 m->errorOut(e, "Pintail", "readFreq");
387 //***************************************************************************************************************
388 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
389 Sequence* Pintail::findPairs(Sequence* q) {
392 Sequence* seqsMatches;
394 seqsMatches = decalc->findClosest(q, templateSeqs);
398 catch(exception& e) {
399 m->errorOut(e, "Pintail", "findPairs");
403 /**************************************************************************************************/
404 void Pintail::createProcessesQuan() {
406 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
408 vector<int> processIDS;
410 //loop through and create all the processes you want
411 while (process != processors) {
415 processIDS.push_back(pid);
419 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
421 //write out data to file so parent can read it
423 string s = toString(getpid()) + ".temp";
424 openOutputFile(s, out);
427 //output observed distances
428 for (int i = 0; i < quantilesMembers.size(); i++) {
429 out << quantilesMembers[i].size() << '\t';
430 for (int j = 0; j < quantilesMembers[i].size(); j++) {
431 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
439 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
442 //force parent to wait until all the processes are done
443 for (int i=0;i<processors;i++) {
444 int temp = processIDS[i];
448 //get data created by processes
449 for (int i=0;i<processors;i++) {
451 string s = toString(processIDS[i]) + ".temp";
452 openInputFile(s, in);
454 vector< vector<quanMember> > quan;
458 for (int m = 0; m < quan.size(); m++) {
464 vector<quanMember> q; float w; int b, n;
465 for (int j = 0; j < num; j++) {
467 //cout << w << '\t' << b << '\t' n << endl;
468 quanMember newMember(w, b, n);
469 q.push_back(newMember);
471 //cout << "here" << endl;
473 //cout << "now here" << endl;
478 //save quan in quantiles
479 for (int j = 0; j < quan.size(); j++) {
480 //put all values of q[i] into quan[i]
481 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
482 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
490 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
493 catch(exception& e) {
494 m->errorOut(e, "Pintail", "createProcessesQuan");
500 //***************************************************************************************************************