5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
// Constructor. From the lines visible here it stores the template file name
// (`temp`), loads the template sequences with readSeqs(), and allocates the
// two helpers used by the rest of the class: an eachGapDist distance
// calculator and a DeCalculator. Errors are handed to m->errorOut().
// NOTE(review): how the remaining parameters (filename, f, p, mask, cons, q,
// win, inc, o) are stored is not visible in this excerpt — presumably they
// map to fastafile, filter, processors, seqMask, consfile, quanfile, window,
// increment and an output location; confirm against the full source.
21 Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() {
25 templateFileName = temp; templateSeqs = readSeqs(temp);
// Both helpers are heap-allocated with raw new and therefore must be released by the destructor.
35 distcalculator = new eachGapDist();
36 decalc = new DeCalculator();
41 m->errorOut(e, "Pintail", "Pintail");
46 //***************************************************************************************************************
// Destructor body (its signature line is not visible in this excerpt; the
// name is taken from the errorOut call below). Releases the distance
// calculator that the constructor allocates with new.
// NOTE(review): the constructor also allocates `decalc` with new — confirm it
// is deleted here as well; that statement is not visible in this excerpt.
51 delete distcalculator;
55 m->errorOut(e, "Pintail", "~Pintail");
59 //***************************************************************************************************************
// One-time preparation before individual queries are scored. From the lines
// visible here it:
//   1. sizes the per-template window vector and the 100-slot quantile tables,
//   2. configures the DeCalculator alignment length and mask,
//   3. splits the template index range into per-process linePair chunks,
//   4. obtains the conservation profile (computed with calcFreq() or read with
//      readFreq()) and inverts it,
//   5. masks the merged query sequences and builds a filter string with createFilter(),
//   6. either reads quantiles from `quanfile` or computes them from the
//      template and writes them out through printQuanFile(),
//   7. frees and re-reads the template sequences and frees the linePair chunks.
// Returns 0 early whenever m->control_pressed is set; errors go to m->errorOut().
60 int Pintail::doPrep() {
63 mergedFilterString = "";
64 windowSizesTemplate.resize(templateSeqs.size(), window);
65 quantiles.resize(100); //one for every percent mismatch
66 quantilesMembers.resize(100); //one for every percent mismatch
68 //if the user does not enter a mask then you want to keep all the spots in the alignment
// NOTE(review): templateSeqs[0] is dereferenced with no visible check that the
// template is non-empty — confirm the template file is validated upstream.
69 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
70 else { decalc->setAlignmentLength(seqMask.length()); }
72 decalc->setMask(seqMask);
77 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
78 //find breakup of templatefile for quantiles
79 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
// Chunk boundaries are spaced by sqrt(i / processors), so lower-numbered
// processes receive wider index ranges. Presumably this balances pairwise
// work that shrinks as the start index grows — confirm against getQuantiles().
81 for (int i = 0; i < processors; i++) {
82 templateLines.push_back(new linePair());
83 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
84 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
// Presumably the fallback branch of the #if above (the #else line is not
// visible in this excerpt): one chunk covering the whole template.
88 templateLines.push_back(new linePair(0, templateSeqs.size()));
92 m->mothurOut("Getting conservation... "); cout.flush();
94 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
95 probabilityProfile = decalc->calcFreq(templateSeqs, templateFileName);
96 if (m->control_pressed) { return 0; }
97 m->mothurOut("Done."); m->mothurOutEndLine();
98 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
99 m->mothurOutEndLine();
// Replace every profile value p with 1 - p, in place.
102 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //
105 //create filter if needed for later
108 //read in all query seqs
109 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
111 vector<Sequence*> temp;
112 //merge query seqs and template seqs
114 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
// Mask every merged sequence; on user interrupt the query copies are freed first.
// NOTE(review): the clean-up loops below declare their own `i`, shadowing the
// index of the enclosing loop.
119 for (int i = 0; i < temp.size(); i++) {
120 if (m->control_pressed) {
121 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
124 decalc->runMask(temp[i]);
128 mergedFilterString = createFilter(temp, 0.5);
130 if (m->control_pressed) {
131 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
135 //reread template seqs
136 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
140 //quantiles are used to determine whether the de values found indicate a chimera
141 //if you have to calculate them, it's time-intensive because you are finding the de and deviation values for each
142 //combination of sequences in the template
143 if (quanfile != "") {
144 quantiles = readQuantiles();
146 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 decalc->runMask(templateSeqs[i]);
157 for (int i = 0; i < templateSeqs.size(); i++) {
158 if (m->control_pressed) { return 0; }
159 runFilter(templateSeqs[i]);
163 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
// Single process: compute over the whole template here; otherwise fan out through createProcessesQuan().
164 if (processors == 1) {
165 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
166 }else { createProcessesQuan(); }
168 if (m->control_pressed) { return 0; }
170 string noOutliers, outliers;
// Output file name depends on which of filter / mask were applied; the
// filtered variants also embed the query file's simple root name.
172 if ((!filter) && (seqMask == "")) {
173 noOutliers = templateFileName + "pintail.quan";
174 }else if ((!filter) && (seqMask != "")) {
175 noOutliers =templateFileName + "pintail.masked.quan";
176 }else if ((filter) && (seqMask != "")) {
177 noOutliers = templateFileName + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "masked.quan";
178 }else if ((filter) && (seqMask == "")) {
179 noOutliers = templateFileName + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "quan";
182 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
184 if (m->control_pressed) { return 0; }
// The .quan file starts with a "#<version>" header line.
186 string outputString = "#" + m->getVersion() + "\n";
// For each of the distance buckets: sort its members and sample the entries
// at 10%, 25%, 50%, 75%, 95% and 99% of the bucket size. Rows are written as
// (i+1) followed by tab-separated values.
// `temp` in this loop holds the sampled values; its declaration is not
// visible in this excerpt (it is not the vector<Sequence*> declared earlier).
189 for (int i = 0; i < quantilesMembers.size(); i++) {
192 if (quantilesMembers[i].size() == 0) {
193 //in case this is not a distance found in your template files
194 for (int g = 0; g < 6; g++) {
199 sort(quantilesMembers[i].begin(), quantilesMembers[i].end());
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)]);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)]);
206 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)]);
208 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)]);
210 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)]);
212 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)]);
217 outputString += toString(i+1) + "\t";
218 for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; }
219 outputString += "\n";
225 printQuanFile(noOutliers, outputString);
228 quantilesMembers.clear();
230 m->mothurOut("Done."); m->mothurOutEndLine();
// The template sequences were masked/filtered in place above, so they are
// freed and loaded again from templateFileName.
234 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
235 templateSeqs.clear();
236 templateSeqs = readSeqs(templateFileName);
// Release the linePair chunks allocated with new near the top of this function.
241 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
246 catch(exception& e) {
247 m->errorOut(e, "Pintail", "doPrep");
251 //***************************************************************************************************************
// Writes the result for the current query to `out` and, when the query is
// flagged as chimeric, logs it and writes its name to `outAcc`.
// The flag is derived from index = ceil(deviation):
//   - index 0            -> "No" (exact match to a template sequence),
//   - quantiles[index][4] == 0.0 -> a message saying the template has no
//     quantile data at that distance,
//   - otherwise "Yes" when DE > quantiles[index][4], else "No".
// The observed and expected distance vectors are then written tab-separated.
252 int Pintail::print(ostream& out, ostream& outAcc) {
// NOTE(review): `index` is used to subscript `quantiles` with no visible
// bounds check — confirm ceil(deviation) can never reach quantiles.size().
254 int index = ceil(deviation);
256 //is your DE value higher than the 95%
258 if (index != 0) { //if index is 0 then it's an exact match to a template seq
259 if (quantiles[index][4] == 0.0) {
260 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
262 if (DE > quantiles[index][4]) { chimera = "Yes"; }
263 else { chimera = "No"; }
265 }else{ chimera = "No"; }
267 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
268 if (chimera == "Yes") {
269 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
270 outAcc << querySeq->getName() << endl;
274 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
// NOTE(review): the loop variable `m` shadows the `m` used for logging
// elsewhere in this function; harmless inside this loop, but easy to trip over.
279 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
285 catch(exception& e) {
286 m->errorOut(e, "Pintail", "print");
291 //***************************************************************************************************************
// MPI overload of print(): builds the same report as text and writes it with
// MPI_File_write_shared(). The chimera flag is derived from
// index = ceil(deviation) exactly as in the visible branches below: "No" for
// index 0, a "no quantile data" message when quantiles[index][4] == 0.0,
// otherwise "Yes" when DE > quantiles[index][4], else "No".
// When the flag is "Yes" the query name is also echoed to cout and written to `outAcc`.
292 int Pintail::print(MPI_File& out, MPI_File& outAcc) {
294 bool results = false;
295 string outputString = "";
// NOTE(review): `index` subscripts `quantiles` with no visible bounds check —
// confirm ceil(deviation) can never reach quantiles.size().
296 int index = ceil(deviation);
298 //is your DE value higher than the 95%
300 if (index != 0) { //if index is 0 then it's an exact match to a template seq
301 if (quantiles[index][4] == 0.0) {
302 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
304 if (DE > quantiles[index][4]) { chimera = "Yes"; }
305 else { chimera = "No"; }
307 }else{ chimera = "No"; }
309 outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n";
310 if (chimera == "Yes") {
311 cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl;
312 string outAccString = querySeq->getName() + "\n";
// Copy the text into a raw buffer of exactly `length` bytes (no terminator) for the MPI write.
// NOTE(review): `buf` is allocated with new[]; its release is not visible in
// this excerpt — confirm it is freed with delete[].
314 MPI_Status statusAcc;
315 int length = outAccString.length();
316 char* buf = new char[length];
317 memcpy(buf, outAccString.c_str(), length);
319 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
324 outputString += "Observed\t";
326 for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; }
327 outputString += "\n";
329 outputString += "Expected\t";
// NOTE(review): the loop variable `m` shadows the `m` used for error reporting in this function.
331 for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; }
332 outputString += "\n";
// Same raw-buffer pattern for the main report.
// NOTE(review): `buf2` is allocated with new[] and its release is not visible
// here; the declaration of `status` is also not visible in this excerpt.
335 int length = outputString.length();
336 char* buf2 = new char[length];
337 memcpy(buf2, outputString.c_str(), length);
339 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
344 catch(exception& e) {
345 m->errorOut(e, "Pintail", "print");
350 //***************************************************************************************************************
// Scores a single query sequence. Visible steps, in order:
//   findPairs()            -> bestfit (closest template sequence),
//   decalc->runMask()      on query and bestfit,
//   decalc->trimSeqs()     -> fills `trimmed`,
//   decalc->findWindows()  -> windowsForeachQuery,
//   decalc->calcObserved() -> obsDistance,
//   decalc->findQav()      -> Qav,
//   decalc->getCoef()      -> seqCoef,
//   decalc->calcExpected() -> expectedDistance,
//   decalc->calcDE()       -> DE,
//   decalc->calcDist()     -> deviation.
// Returns 0 early whenever m->control_pressed is set; errors go to m->errorOut().
// The results are stored in members and later reported by print().
351 int Pintail::getChimeras(Sequence* query) {
355 windowSizes = window;
357 //find pairs has to be done before a mask
358 bestfit = findPairs(query);
360 if (m->control_pressed) { return 0; }
364 decalc->runMask(query);
365 decalc->runMask(bestfit);
368 if (filter) { //must be done after a mask
375 decalc->trimSeqs(query, bestfit, trimmed);
// The first entry of `trimmed` supplies the two positions handed to findWindows().
378 it = trimmed.begin();
379 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
381 //find observed distance
382 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
384 if (m->control_pressed) { return 0; }
386 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
388 if (m->control_pressed) { return 0; }
391 seqCoef = decalc->getCoef(obsDistance, Qav);
393 //calculating expected distance
394 expectedDistance = decalc->calcExpected(Qav, seqCoef);
396 if (m->control_pressed) { return 0; }
399 DE = decalc->calcDE(obsDistance, expectedDistance);
401 if (m->control_pressed) { return 0; }
403 //find distance between query and closest match
// The same first entry of `trimmed` bounds the distance calculation.
404 it = trimmed.begin();
405 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
411 catch(exception& e) {
412 m->errorOut(e, "Pintail", "getChimeras");
417 //***************************************************************************************************************
// Reads the conservation file `consfile` and returns a probability vector.
// Two read paths are visible: one that loads the whole file through MPI and
// parses it from an istringstream, and one that reads it through
// m->openInputFile(). Both skip a first line, then for every position that is
// present in the mask position set `h` compute Pi = (num - 0.25) / 0.75 and
// clamp negative results to 0.
// The preprocessor lines selecting between the two paths, and the statements
// that read `pos`/`num` and store Pi, are not visible in this excerpt.
419 vector<float> Pintail::readFreq() {
421 //read in probabilities and store in vector
425 set<int> h = decalc->getPos(); //positions of bases in masking sequence
433 //char* inFileName = new char[consfile.length()];
434 //memcpy(inFileName, consfile.c_str(), consfile.length());
// NOTE(review): strcpy into a fixed 1024-byte array overflows if consfile is
// 1024 characters or longer; consfile.c_str() could presumably be passed
// directly (or via a length-checked copy) — confirm the MPI binding's
// const-ness requirements.
436 char inFileName[1024];
437 strcpy(inFileName, consfile.c_str());
439 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
440 MPI_File_get_size(inMPI, &size);
// NOTE(review): `buffer` holds exactly `size` bytes and no terminator is
// visibly written, yet it is converted to std::string as a C string below,
// which can read past the allocation. The substr() afterwards only trims the
// result; string(buffer, size) would avoid the over-read. The release of
// `buffer` is not visible in this excerpt — it needs delete[].
443 char* buffer = new char[size];
444 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
446 string tempBuf = buffer;
449 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
450 istringstream iss (tempBuf,istringstream::in);
// Skip the first line of the file before parsing values.
453 string line = m->getline(iss); m->gobble(iss);
// Only positions contained in the mask position set are kept.
458 if (h.count(pos) > 0) {
460 Pi = (num - 0.25) / 0.75;
462 //cannot have probability less than 0.
463 if (Pi < 0) { Pi = 0.0; }
465 //do you want this spot
472 MPI_File_close(&inMPI);
// Stream-based path: same parsing, reading through m->openInputFile().
477 m->openInputFile(consfile, in);
480 string line = m->getline(in); m->gobble(in);
486 if (h.count(pos) > 0) {
488 Pi = (num - 0.25) / 0.75;
490 //cannot have probability less than 0.
491 if (Pi < 0) { Pi = 0.0; }
493 //do you want this spot
506 catch(exception& e) {
507 m->errorOut(e, "Pintail", "readFreq");
512 //***************************************************************************************************************
513 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
// Thin wrapper: asks decalc->findClosest() for the template sequence closest
// to `q` and stores the result in `seqsMatches`. Errors go to m->errorOut().
// NOTE(review): the return statement is not visible in this excerpt —
// presumably `seqsMatches` is returned; who owns the returned Sequence* cannot
// be determined from here.
514 Sequence* Pintail::findPairs(Sequence* q) {
517 Sequence* seqsMatches;
519 seqsMatches = decalc->findClosest(q, templateSeqs);
523 catch(exception& e) {
524 m->errorOut(e, "Pintail", "findPairs");
528 //**************************************************************************************************
// Computes quantilesMembers using several processes on Unix-like platforms.
// Visible flow: spawn processes until `process` reaches `processors`; each
// spawned process runs getQuantiles() on its templateLines[process] range and
// writes the result to "<pid>.temp"; the parent computes the
// templateLines[0] range itself, waits for the others, then reads each
// "<pid>.temp" file and appends its values bucket-by-bucket to
// quantilesMembers. On other platforms getQuantiles() is run once over the
// whole template.
// The spawning call, the wait call and the file-parsing statements are not
// visible in this excerpt.
529 void Pintail::createProcessesQuan() {
531 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
533 vector<int> processIDS;
535 //loop through and create all the processes you want
536 while (process != processors) {
// Remember each spawned pid so the parent can wait on it and locate its temp file.
540 processIDS.push_back(pid);
544 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
546 //write out data to file so parent can read it
548 string s = toString(getpid()) + ".temp";
549 m->openOutputFile(s, out);
551 //output observed distances
// File format per bucket: the member count, then each member, all tab-separated.
552 for (int i = 0; i < quantilesMembers.size(); i++) {
553 out << quantilesMembers[i].size() << '\t';
554 for (int j = 0; j < quantilesMembers[i].size(); j++) {
555 out << quantilesMembers[i][j] << '\t';
// NOTE(review): exit(0) reports success to the caller even though spawning
// failed — a non-zero status would be more accurate.
563 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
566 //parent does its part
567 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[0]->start, templateLines[0]->end);
569 //force parent to wait until all the processes are done
570 for (int i=0;i<(processors-1);i++) {
571 int temp = processIDS[i];
575 //get data created by processes
// NOTE(review): removal of the "<pid>.temp" files after reading is not
// visible in this excerpt — confirm they are cleaned up.
576 for (int i=0;i<(processors-1);i++) {
578 string s = toString(processIDS[i]) + ".temp";
579 m->openInputFile(s, in);
581 vector< vector<float> > quan;
585 for (int h = 0; h < quan.size(); h++) {
591 vector<float> q; float w;
592 for (int j = 0; j < num; j++) {
602 //save quan in quantiles
603 for (int j = 0; j < quan.size(); j++) {
604 //put all values of q[i] into quan[i]
605 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
606 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
// Presumably the fallback branch of the #if above (the #else line is not
// visible in this excerpt): single pass over the whole template.
614 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
617 catch(exception& e) {
618 m->errorOut(e, "Pintail", "createProcessesQuan");
622 //***************************************************************************************************************
// Reads the quantile table from `quanfile` and returns it as rows of floats.
// Two read paths are visible: one that loads the whole file through MPI and
// parses it from an istringstream, and one that reads it through
// m->openInputFile(). Both skip a first line, then read rows of the form
//   num ten twentyfive fifty seventyfive ninetyfive ninetynine
// and append one row per record to `quan`.
// The preprocessor lines selecting between the two paths and the loop
// headers are not visible in this excerpt.
623 vector< vector<float> > Pintail::readQuantiles() {
626 float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
628 vector< vector<float> > quan;
// A row of six zeros is appended before any file rows.
629 vector <float> temp; temp.resize(6, 0);
632 quan.push_back(temp);
640 //char* inFileName = new char[quanfile.length()];
641 //memcpy(inFileName, quanfile.c_str(), quanfile.length());
// NOTE(review): strcpy into a fixed 1024-byte array overflows if quanfile is
// 1024 characters or longer.
643 char inFileName[1024];
644 strcpy(inFileName, quanfile.c_str());
646 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
647 MPI_File_get_size(inMPI, &size);
// NOTE(review): `buffer` holds exactly `size` bytes with no visible
// terminator, yet it is converted to std::string as a C string below, which
// can read past the allocation; string(buffer, size) would avoid that. The
// release of `buffer` is not visible in this excerpt — it needs delete[].
651 char* buffer = new char[size];
652 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
654 string tempBuf = buffer;
655 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
656 istringstream iss (tempBuf,istringstream::in);
// Skip the first line of the file before parsing rows.
660 string line = m->getline(iss); m->gobble(iss);
663 iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
// NOTE(review): the statements that reset `temp` and push `ten` are not
// visible in this excerpt — confirm each row ends up with exactly six values.
668 temp.push_back(twentyfive);
669 temp.push_back(fifty);
670 temp.push_back(seventyfive);
671 temp.push_back(ninetyfive);
672 temp.push_back(ninetynine);
674 quan.push_back(temp);
679 MPI_File_close(&inMPI);
// Stream-based path: same row format, read through m->openInputFile().
684 m->openInputFile(quanfile, in);
687 string line = m->getline(in); m->gobble(in);
691 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
696 temp.push_back(twentyfive);
697 temp.push_back(fifty);
698 temp.push_back(seventyfive);
699 temp.push_back(ninetyfive);
700 temp.push_back(ninetynine);
702 quan.push_back(temp);
712 catch(exception& e) {
713 m->errorOut(e, "Pintail", "readQuantiles");
717 //***************************************************************************************************************/
// Writes `outputString` to the file named `file`. Two write paths are
// visible: one that opens the file with MPI_File_open() on MPI_COMM_SELF in
// create + write-only mode and writes the bytes with MPI_File_write(), and
// one that opens it with m->openOutputFile() and streams the string.
// The preprocessor lines selecting between the paths are not visible in this
// excerpt. Errors go to m->errorOut().
719 void Pintail::printQuanFile(string file, string outputString) {
728 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
730 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
732 //char* FileName = new char[file.length()];
733 //memcpy(FileName, file.c_str(), file.length());
// NOTE(review): the declaration of FileName is not visible in this excerpt;
// if it is a fixed-size array, this strcpy overflows for long paths.
736 strcpy(FileName, file.c_str());
739 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer
// Copy the text into a raw buffer of exactly `length` bytes for the MPI write.
// NOTE(review): `buf` is allocated with new[]; its release is not visible in
// this excerpt — confirm it is freed with delete[].
741 int length = outputString.length();
742 char* buf = new char[length];
743 memcpy(buf, outputString.c_str(), length);
745 MPI_File_write(outQuan, buf, length, MPI_CHAR, &status);
748 MPI_File_close(&outQuan);
// Stream-based path.
754 m->openOutputFile(file, outQuan);
756 outQuan << outputString;
761 catch(exception& e) {
762 m->errorOut(e, "Pintail", "printQuanFile");
767 //***************************************************************************************************************/