5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
// Ordering predicate for quanMember values: returns true when left.score is
// strictly less than right.score. doPrep passes it to sort() so that each
// quantile bucket is arranged in ascending score order.
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
// Constructor. Records the template file name, loads the template sequences
// with readSeqs(temp), and allocates the two helper objects used by the rest of
// the class: an eachGapDist distance calculator and a DeCalculator.
// Failures are handed to m->errorOut(e, "Pintail", "Pintail").
// NOTE(review): presumably f/p/mask/cons/q/win/inc/o map to the filter flag,
// processor count, mask, conservation file, quantile file, window size,
// increment and output directory -- confirm against the header.
21 Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() {
25 templateFileName = temp; templateSeqs = readSeqs(temp);
// Both helpers are allocated with new and are therefore owned by this object.
35 distcalculator = new eachGapDist();
36 decalc = new DeCalculator();
41 m->errorOut(e, "Pintail", "Pintail");
46 //***************************************************************************************************************
// Destructor cleanup: releases the distance calculator that the constructor
// allocated with new.
// NOTE(review): the constructor also allocates decalc with new -- confirm that
// it is released as well.
51 delete distcalculator;
55 m->errorOut(e, "Pintail", "~Pintail");
59 //***************************************************************************************************************
// One-time preparation that must run before queries are scored:
//   1. sizes the quantile tables (100 rows) and gives decalc the alignment
//      length and the mask;
//   2. splits the template into per-process index ranges (templateLines);
//   3. obtains the conservation profile, either computed by decalc->calcFreq or
//      loaded by readFreq(), and replaces each value p with 1 - p;
//   4. builds mergedFilterString from the masked sequences via createFilter;
//   5. obtains the quantiles: read from quanfile when one was given, otherwise
//      computed from the template sequences and written out by printQuanFile;
//   6. reloads the template sequences from disk and frees templateLines.
// Returns 0 early at every point where m->control_pressed is found set.
// Errors are handed to m->errorOut(e, "Pintail", "doPrep").
60 int Pintail::doPrep() {
63 mergedFilterString = "";
64 windowSizesTemplate.resize(templateSeqs.size(), window);
65 quantiles.resize(100); //one for every percent mismatch
66 quantilesMembers.resize(100); //one for every percent mismatch
68 //if the user does not enter a mask then you want to keep all the spots in the alignment
// NOTE(review): templateSeqs[0] is read without an emptiness check -- confirm
// the template can never be empty here.
69 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
70 else { decalc->setAlignmentLength(seqMask.length()); }
72 decalc->setMask(seqMask);
// NOTE(review): only __APPLE__ is wrapped in defined(); the other three names
// are evaluated as plain macro values. Confirm that this is intended.
77 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
78 //find breakup of templatefile for quantiles
79 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
81 for (int i = 0; i < processors; i++) {
82 templateLines.push_back(new linePair());
// Boundaries are spaced by sqrt(i / processors), so earlier ranges cover more
// template sequences than later ones. Presumably this evens out the pairwise
// work done per process -- confirm against DeCalculator::getQuantiles.
83 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
84 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
// Single range covering the whole template.
88 templateLines.push_back(new linePair(0, templateSeqs.size()));
92 m->mothurOut("Getting conservation... "); cout.flush();
94 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
95 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
96 if (m->control_pressed) { return 0; }
97 m->mothurOut("Done."); m->mothurOutEndLine();
// Otherwise the profile is loaded by readFreq().
98 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
99 m->mothurOutEndLine();
// Replace every profile value p with 1 - p.
102 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //
105 //create filter if needed for later
108 //read in all query seqs
109 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
111 vector<Sequence*> temp;
112 //merge query seqs and template seqs
114 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
// Mask every merged sequence; on cancellation the query copies read above are
// freed first.
119 for (int i = 0; i < temp.size(); i++) {
120 if (m->control_pressed) {
// NOTE(review): this inner index i shadows the enclosing loop's i.
121 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
124 decalc->runMask(temp[i]);
128 mergedFilterString = createFilter(temp, 0.5);
130 if (m->control_pressed) {
131 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
135 //reread template seqs
136 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
140 //quantiles are used to determine whether the de values found indicate a chimera
141 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
142 //combination of sequences in the template
143 if (quanfile != "") {
144 quantiles = readQuantiles();
146 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 decalc->runMask(templateSeqs[i]);
// Apply the filter to each template sequence.
157 for (int i = 0; i < templateSeqs.size(); i++) {
158 if (m->control_pressed) { return 0; }
159 runFilter(templateSeqs[i]);
163 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
// With one processor the whole template range is handled in this process;
// otherwise the work is delegated to createProcessesQuan().
164 if (processors == 1) {
165 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
166 }else { createProcessesQuan(); }
168 if (m->control_pressed) { return 0; }
170 string noOutliers, outliers;
// The output file name encodes which of filter / mask were applied, so the four
// combinations produce four distinct names.
172 if ((!filter) && (seqMask == "")) {
173 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
174 }else if ((!filter) && (seqMask != "")) {
175 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
176 }else if ((filter) && (seqMask != "")) {
177 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
178 }else if ((filter) && (seqMask == "")) {
179 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
182 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
184 if (m->control_pressed) { return 0; }
186 string outputString = "";
// Reduce each bucket to six summary scores and append one text row per bucket.
189 for (int i = 0; i < quantilesMembers.size(); i++) {
192 if (quantilesMembers[i].size() == 0) {
193 //in case this is not a distance found in your template files
194 for (int g = 0; g < 6; g++) {
// Non-empty bucket: sort ascending by score, then pick the elements at the
// 10%, 25%, 50%, 75%, 95% and 99% positions (index truncated toward zero).
199 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
206 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
208 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
210 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
212 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
// Rows are labelled i+1 and are tab-separated.
217 outputString += toString(i+1) + "\t";
218 for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; }
219 outputString += "\n";
225 printQuanFile(noOutliers, outputString);
227 m->mothurOut("Done."); m->mothurOutEndLine();
// Discard the in-memory template sequences and load fresh copies from
// templateFileName.
231 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
232 templateSeqs.clear();
233 templateSeqs = readSeqs(templateFileName);
// The linePair ranges were allocated with new above; free them.
238 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
243 catch(exception& e) {
244 m->errorOut(e, "Pintail", "doPrep");
248 //***************************************************************************************************************
// Writes the result for the current query sequence to `out`; when the query is
// flagged "Yes" it is also logged through m->mothurOut and its name is written
// to `outAcc`.
// The flag is decided by comparing DE with entry [4] of the quantile row
// selected by ceil(deviation). An entry of exactly 0.0 produces an explanatory
// message instead of Yes/No, and index 0 always yields "No".
// The observed and expected distance vectors are written tab-separated.
// Errors are handed to m->errorOut(e, "Pintail", "print").
249 int Pintail::print(ostream& out, ostream& outAcc) {
// NOTE(review): index is not range-checked against quantiles.size() before it
// is used below -- confirm that deviation cannot exceed the table.
251 int index = ceil(deviation);
253 //is your DE value higher than the 95%
255 if (index != 0) { //if index is 0 then its an exact match to a template seq
256 if (quantiles[index][4] == 0.0) {
257 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
259 if (DE > quantiles[index][4]) { chimera = "Yes"; }
260 else { chimera = "No"; }
262 }else{ chimera = "No"; }
264 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
265 if (chimera == "Yes") {
266 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
267 outAcc << querySeq->getName() << endl;
271 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
// NOTE(review): the loop variable m shadows the member m used for logging
// elsewhere in this function; harmless inside the loop but easy to misread.
276 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
282 catch(exception& e) {
283 m->errorOut(e, "Pintail", "print");
288 //***************************************************************************************************************
// MPI overload of print(): makes the same chimera decision as the stream
// overload, but assembles the report in outputString and writes it with
// MPI_File_write_shared. When the query is flagged "Yes" the line is echoed on
// cout and the query name is written to outAcc.
// Errors are handed to m->errorOut(e, "Pintail", "print").
289 int Pintail::print(MPI_File& out, MPI_File& outAcc) {
291 bool results = false;
292 string outputString = "";
// NOTE(review): index is not range-checked against quantiles.size() before it
// is used below -- confirm that deviation cannot exceed the table.
293 int index = ceil(deviation);
295 //is your DE value higher than the 95%
297 if (index != 0) { //if index is 0 then its an exact match to a template seq
298 if (quantiles[index][4] == 0.0) {
299 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
301 if (DE > quantiles[index][4]) { chimera = "Yes"; }
302 else { chimera = "No"; }
304 }else{ chimera = "No"; }
306 outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n";
307 if (chimera == "Yes") {
308 cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl;
309 string outAccString = querySeq->getName() + "\n";
311 MPI_Status statusAcc;
312 int length = outAccString.length();
// NOTE(review): strcpy also writes the terminating NUL, so buf must hold at
// least length + 1 bytes -- confirm its declaration.
314 strcpy(buf, outAccString.c_str());
316 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
320 outputString += "Observed\t";
322 for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; }
323 outputString += "\n";
325 outputString += "Expected\t";
// NOTE(review): the loop variable m shadows the member m used for error
// reporting in this function.
327 for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; }
328 outputString += "\n";
331 int length = outputString.length();
// NOTE(review): same sizing concern as above -- buf2 needs length + 1 bytes.
333 strcpy(buf2, outputString.c_str());
335 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
339 catch(exception& e) {
340 m->errorOut(e, "Pintail", "print");
345 //***************************************************************************************************************
// Scores one query sequence against its closest template sequence and stores
// the results in members (obsDistance, Qav, seqCoef, expectedDistance, DE,
// deviation) that print() reads later.
// Order of operations: find the best-fit template, mask query and best fit,
// optionally filter, trim both to a common region, choose the windows, then
// compute observed distances -> Qav -> coefficient -> expected distances -> DE,
// and finally the query/best-fit distance (deviation).
// Returns 0 early at every point where m->control_pressed is found set.
// Errors are handed to m->errorOut(e, "Pintail", "getChimeras").
346 int Pintail::getChimeras(Sequence* query) {
350 windowSizes = window;
352 //find pairs has to be done before a mask
353 bestfit = findPairs(query);
355 if (m->control_pressed) { return 0; }
// Mask the query and its best fit the same way.
359 decalc->runMask(query);
360 decalc->runMask(bestfit);
363 if (filter) { //must be done after a mask
// trimSeqs fills `trimmed`; its first entry supplies the two positions passed
// to findWindows and calcDist below.
370 decalc->trimSeqs(query, bestfit, trimmed);
373 it = trimmed.begin();
374 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
376 //find observed distance
377 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
379 if (m->control_pressed) { return 0; }
381 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
383 if (m->control_pressed) { return 0; }
386 seqCoef = decalc->getCoef(obsDistance, Qav);
388 //calculating expected distance
389 expectedDistance = decalc->calcExpected(Qav, seqCoef);
391 if (m->control_pressed) { return 0; }
394 DE = decalc->calcDE(obsDistance, expectedDistance);
396 if (m->control_pressed) { return 0; }
398 //find distance between query and closest match
399 it = trimmed.begin();
400 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
406 catch(exception& e) {
407 m->errorOut(e, "Pintail", "getChimeras");
412 //***************************************************************************************************************
// Loads the conservation profile from consfile and returns it as a vector.
// Only positions contained in decalc->getPos() are considered. For those, the
// frequency `num` is rescaled to Pi = (num - 0.25) / 0.75 and negative results
// are clamped to 0.
// There are two read paths with the same per-position logic: one that reads the
// whole file through MPI-IO and parses it from a string stream, and one that
// uses openInputFile.
// Errors are handed to m->errorOut(e, "Pintail", "readFreq").
414 vector<float> Pintail::readFreq() {
416 //read in probabilities and store in vector
420 set<int> h = decalc->getPos(); //positions of bases in masking sequence
// NOTE(review): the array holds consfile.length() chars, but strcpy also writes
// the terminating NUL, i.e. length() + 1 bytes -- a one-byte overflow. The array
// needs length() + 1 elements. A runtime-sized array is also a compiler
// extension, not standard C++.
428 char inFileName[consfile.length()];
429 strcpy(inFileName, consfile.c_str());
431 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
432 MPI_File_get_size(inMPI, &size);
435 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
// NOTE(review): building a string from a char buffer requires NUL termination,
// which MPI_File_read does not add -- confirm how buffer is declared and
// terminated. The substr below only trims the result when it came out longer
// than `size`.
437 string tempBuf = buffer;
439 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
440 istringstream iss (tempBuf,istringstream::in);
// Keep only positions that are present in h.
445 if (h.count(pos) > 0) {
447 Pi = (num - 0.25) / 0.75;
449 //cannot have probability less than 0.
450 if (Pi < 0) { Pi = 0.0; }
452 //do you want this spot
459 MPI_File_close(&inMPI);
// Second read path: same logic, reading through openInputFile.
464 openInputFile(consfile, in);
470 if (h.count(pos) > 0) {
472 Pi = (num - 0.25) / 0.75;
474 //cannot have probability less than 0.
475 if (Pi < 0) { Pi = 0.0; }
477 //do you want this spot
490 catch(exception& e) {
491 m->errorOut(e, "Pintail", "readFreq");
496 //***************************************************************************************************************
497 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
// Looks up the template sequence closest to q by delegating to
// decalc->findClosest(q, templateSeqs); the match is held in seqsMatches.
// getChimeras stores this function's result as `bestfit`.
// Errors are handed to m->errorOut(e, "Pintail", "findPairs").
498 Sequence* Pintail::findPairs(Sequence* q) {
501 Sequence* seqsMatches;
503 seqsMatches = decalc->findClosest(q, templateSeqs);
507 catch(exception& e) {
508 m->errorOut(e, "Pintail", "findPairs");
512 //**************************************************************************************************
// Computes quantilesMembers using several processes (on the platforms selected
// by the #if below).
// Each spawned process runs decalc->getQuantiles on its own templateLines range
// and writes the result to "<pid>.temp": for every bucket the element count,
// then score / member1 / member2 for each element, all tab-separated.
// The parent then reads each process's file back and appends its entries to
// quantilesMembers, bucket by bucket.
// The last getQuantiles call covers the whole template in one call.
// Errors are handed to m->errorOut(e, "Pintail", "createProcessesQuan").
513 void Pintail::createProcessesQuan() {
// NOTE(review): only __APPLE__ is wrapped in defined(); the other three names
// are evaluated as plain macro values. Confirm that this is intended.
515 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
517 vector<int> processIDS;
519 //loop through and create all the processes you want
520 while (process != processors) {
// Remember the spawned process id so its output file can be located later.
524 processIDS.push_back(pid);
528 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
530 //write out data to file so parent can read it
532 string s = toString(getpid()) + ".temp";
533 openOutputFile(s, out);
536 //output observed distances
537 for (int i = 0; i < quantilesMembers.size(); i++) {
538 out << quantilesMembers[i].size() << '\t';
539 for (int j = 0; j < quantilesMembers[i].size(); j++) {
540 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
// NOTE(review): exit(0) reports success to the caller even though spawning
// failed; a non-zero status would presumably be more appropriate.
548 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
551 //force parent to wait until all the processes are done
552 for (int i=0;i<processors;i++) {
553 int temp = processIDS[i];
557 //get data created by processes
558 for (int i=0;i<processors;i++) {
560 string s = toString(processIDS[i]) + ".temp";
561 openInputFile(s, in);
563 vector< vector<quanMember> > quan;
// NOTE(review): the loop variable m shadows the member m used for logging and
// error reporting in this function.
567 for (int m = 0; m < quan.size(); m++) {
573 vector<quanMember> q; float w; int b, n;
574 for (int j = 0; j < num; j++) {
577 quanMember newMember(w, b, n);
578 q.push_back(newMember);
586 //save quan in quantiles
587 for (int j = 0; j < quan.size(); j++) {
588 //put all values of q[i] into quan[i]
589 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
590 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
// Whole template handled in a single call.
598 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
601 catch(exception& e) {
602 m->errorOut(e, "Pintail", "createProcessesQuan");
606 //***************************************************************************************************************
// Loads a quantile table from quanfile and returns it as rows of floats.
// Each record in the file is a row number followed by six values (10%, 25%,
// 50%, 75%, 95% and 99%, going by the variable names).
// A row of six zeros is pushed onto the result before the file rows.
// There are two read paths with the same parsing: one that reads the whole file
// through MPI-IO and parses it from a string stream, and one that uses
// openInputFile.
// Errors are handed to m->errorOut(e, "Pintail", "readQuantiles").
607 vector< vector<float> > Pintail::readQuantiles() {
610 float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
612 vector< vector<float> > quan;
613 vector <float> temp; temp.resize(6, 0);
616 quan.push_back(temp);
// NOTE(review): the array holds quanfile.length() chars, but strcpy also writes
// the terminating NUL, i.e. length() + 1 bytes -- a one-byte overflow. The array
// needs length() + 1 elements. A runtime-sized array is also a compiler
// extension, not standard C++.
624 char inFileName[quanfile.length()];
625 strcpy(inFileName, quanfile.c_str());
627 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
628 MPI_File_get_size(inMPI, &size);
631 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
// NOTE(review): building a string from a char buffer requires NUL termination,
// which MPI_File_read does not add -- confirm how buffer is declared and
// terminated.
633 string tempBuf = buffer;
634 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
635 istringstream iss (tempBuf,istringstream::in);
638 iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
643 temp.push_back(twentyfive);
644 temp.push_back(fifty);
645 temp.push_back(seventyfive);
646 temp.push_back(ninetyfive);
647 temp.push_back(ninetynine);
649 quan.push_back(temp);
654 MPI_File_close(&inMPI);
// Second read path: same record layout, reading through openInputFile.
659 openInputFile(quanfile, in);
663 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
668 temp.push_back(twentyfive);
669 temp.push_back(fifty);
670 temp.push_back(seventyfive);
671 temp.push_back(ninetyfive);
672 temp.push_back(ninetynine);
674 quan.push_back(temp);
684 catch(exception& e) {
685 m->errorOut(e, "Pintail", "readQuantiles");
689 //***************************************************************************************************************/
// Writes outputString to the file named `file`.
// There are two write paths: one opens the file with MPI_File_open on
// MPI_COMM_SELF (create + write-only) and writes the text with MPI_File_write,
// the other opens it with openOutputFile and streams the text.
// Errors are handed to m->errorOut(e, "Pintail", "printQuanFile").
691 void Pintail::printQuanFile(string file, string outputString) {
700 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
702 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
// NOTE(review): the array holds file.length() chars, but strcpy also writes the
// terminating NUL, i.e. length() + 1 bytes -- a one-byte overflow. The array
// needs length() + 1 elements. A runtime-sized array is also a compiler
// extension, not standard C++.
704 char FileName[file.length()];
705 strcpy(FileName, file.c_str());
708 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer
710 int length = outputString.length();
// NOTE(review): strcpy writes length + 1 bytes -- confirm that buf is declared
// with room for the terminator.
712 strcpy(buf, outputString.c_str());
714 MPI_File_write(outQuan, buf, length, MPI_CHAR, &status);
716 MPI_File_close(&outQuan);
// Second write path: stream the text to the file.
720 openOutputFile(file, outQuan);
722 outQuan << outputString;
727 catch(exception& e) {
728 m->errorOut(e, "Pintail", "printQuanFile");
733 //***************************************************************************************************************/