5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //Comparison predicate used to sort quanMember values from lowest to highest score.
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score); //strict less-than on the score field only
19 //***************************************************************************************************************
//Constructor. The visible statements record the template file name, load the template sequences with readSeqs,
//and allocate the eachGapDist distance calculator and the DeCalculator helper.
//NOTE(review): the statements that consume the other parameters are not visible in this listing;
//presumably they are copied into members - confirm against the full file.
21 Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() {
25 templateFileName = temp; templateSeqs = readSeqs(temp);
35 distcalculator = new eachGapDist();
36 decalc = new DeCalculator();
//error report labelled with the class and constructor name
41 m->errorOut(e, "Pintail", "Pintail");
46 //***************************************************************************************************************
//NOTE(review): the signature line of this definition is not visible in this listing; the error label below names ~Pintail.
//frees the eachGapDist object that the constructor allocates with new
51 delete distcalculator;
//error report labelled with the class and destructor name
55 m->errorOut(e, "Pintail", "~Pintail");
59 //***************************************************************************************************************
//Prepares a run: configures masking, obtains the conservation probabilities, optionally builds a filter
//from the query and template sequences, and reads or calculates the quantiles.
//Returns 0 early whenever m->control_pressed is set.
//NOTE(review): this listing omits some original lines (try/else/#else/closing braces), so the comments
//below describe only the statements that are visible.
60 int Pintail::doPrep() {
63 mergedFilterString = "";
//one window-size entry per template sequence, each initialised to window
64 windowSizesTemplate.resize(templateSeqs.size(), window);
65 quantiles.resize(100); //one for every percent mismatch
66 quantilesMembers.resize(100); //one for every percent mismatch
68 //if the user does not enter a mask then you want to keep all the spots in the alignment
//NOTE(review): templateSeqs[0] is read without a visible empty check - confirm the template can never be empty here
69 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
70 else { decalc->setAlignmentLength(seqMask.length()); }
72 decalc->setMask(seqMask);
77 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
78 //find breakup of templatefile for quantiles
//one processor: a single linePair covering every template sequence.
//several processors: boundary i is sqrt(i/processors) * templateSeqs.size(), so the ranges are contiguous
//(end of range i equals start of range i+1) but not equal in length.
79 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
81 for (int i = 0; i < processors; i++) {
82 templateLines.push_back(new linePair());
83 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
84 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
//NOTE(review): presumably the branch used when the platform test above fails - the #else line is not visible
88 templateLines.push_back(new linePair(0, templateSeqs.size()));
92 m->mothurOut("Getting conservation... "); cout.flush();
//NOTE(review): the condition choosing between calcFreq and readFreq is not visible here;
//presumably it tests whether a conservation file was supplied - confirm
94 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
95 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
96 if (m->control_pressed) { return 0; }
97 m->mothurOut("Done."); m->mothurOutEndLine();
98 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
99 m->mothurOutEndLine();
102 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //replace each value with its complement
105 //create filter if needed for later
108 //read in all query seqs
109 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
111 vector<Sequence*> temp;
112 //merge query seqs and template seqs
114 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
//mask every merged sequence; if the run is cancelled the query sequences are deleted first
//(the inner loop declares its own i, separate from the outer loop's i)
119 for (int i = 0; i < temp.size(); i++) {
120 if (m->control_pressed) {
121 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
124 decalc->runMask(temp[i]);
//filter string built from the merged sequences with a 0.5 argument to createFilter
128 mergedFilterString = createFilter(temp, 0.5);
130 if (m->control_pressed) {
131 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
135 //reread template seqs
//the query sequences read above are no longer needed
136 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
140 //quantiles are used to determine whether the de values found indicate a chimera
141 //if you have to calculate them, it's time intensive because you are finding the de and deviation values for each
142 //combination of sequences in the template
//a supplied quantile file is read directly instead of recalculating
143 if (quanfile != "") {
144 quantiles = readQuantiles();
146 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 decalc->runMask(templateSeqs[i]);
//NOTE(review): presumably guarded by a filter test that is not visible in this listing
157 for (int i = 0; i < templateSeqs.size(); i++) {
158 if (m->control_pressed) { return 0; }
159 runFilter(templateSeqs[i]);
163 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
//single processor: compute over the whole template here; otherwise delegate to createProcessesQuan
164 if (processors == 1) {
165 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
166 }else { createProcessesQuan(); }
168 if (m->control_pressed) { return 0; }
170 string noOutliers, outliers;
//the output file name encodes which of filter and mask were in effect
172 if ((!filter) && (seqMask == "")) {
173 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
174 }else if ((!filter) && (seqMask != "")) {
175 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
176 }else if ((filter) && (seqMask != "")) {
177 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
178 }else if ((filter) && (seqMask == "")) {
179 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
182 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
184 if (m->control_pressed) { return 0; }
186 string outputString = "";
//build one output row per entry of quantilesMembers
189 for (int i = 0; i < quantilesMembers.size(); i++) {
192 if (quantilesMembers[i].size() == 0) {
193 //in case this is not a distance found in your template files
//NOTE(review): the loop body is not visible; presumably it adds six placeholder values - confirm
194 for (int g = 0; g < 6; g++) {
//sort ascending, then take the element at index int(size * fraction) for each of the six fractions
199 sort(quantilesMembers[i].begin(), quantilesMembers[i].end());
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)]);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)]);
206 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)]);
208 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)]);
210 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)]);
212 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)]);
//row format: (i+1), then each value, all tab-separated, ending with a newline
217 outputString += toString(i+1) + "\t";
218 for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; }
219 outputString += "\n";
225 printQuanFile(noOutliers, outputString);
228 quantilesMembers.clear();
230 m->mothurOut("Done."); m->mothurOutEndLine();
//discard the current template sequences and load fresh copies from templateFileName
234 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
235 templateSeqs.clear();
236 templateSeqs = readSeqs(templateFileName);
//free the linePair objects allocated with new earlier in this function
241 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
246 catch(exception& e) {
247 m->errorOut(e, "Pintail", "doPrep");
251 //***************************************************************************************************************
//Writes the result line and the distance values for the current query sequence to out,
//and writes the query name to outAcc when the chimera flag is "Yes".
252 int Pintail::print(ostream& out, ostream& outAcc) {
//deviation rounded up selects the row of quantiles used for the comparison
//NOTE(review): no bounds check of index against quantiles.size() is visible in this listing - confirm
254 int index = ceil(deviation);
256 //is your DE value higher than the 95%
258 if (index != 0) { //if index is 0 then its an exact match to a template seq
//a value of exactly 0.0 in column 4 is treated as "no quantile available at this distance"
259 if (quantiles[index][4] == 0.0) {
260 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
262 if (DE > quantiles[index][4]) { chimera = "Yes"; }
263 else { chimera = "No"; }
265 }else{ chimera = "No"; }
267 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
//flagged sequences are also reported through mothurOut and listed in the accession output
268 if (chimera == "Yes") {
269 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
270 outAcc << querySeq->getName() << endl;
//observed distances, tab-separated
274 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
//expected distances, tab-separated; the loop index m hides the outer m (used as m->... elsewhere) inside this loop only
279 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
285 catch(exception& e) {
286 m->errorOut(e, "Pintail", "print");
291 //***************************************************************************************************************
//MPI overload of print: builds the same result text in a string and writes it with MPI_File_write_shared;
//the query name is written to outAcc when the chimera flag is "Yes".
292 int Pintail::print(MPI_File& out, MPI_File& outAcc) {
294 bool results = false;
295 string outputString = "";
//deviation rounded up selects the row of quantiles used for the comparison
//NOTE(review): no bounds check of index against quantiles.size() is visible in this listing - confirm
296 int index = ceil(deviation);
298 //is your DE value higher than the 95%
300 if (index != 0) { //if index is 0 then its an exact match to a template seq
//a value of exactly 0.0 in column 4 is treated as "no quantile available at this distance"
301 if (quantiles[index][4] == 0.0) {
302 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
304 if (DE > quantiles[index][4]) { chimera = "Yes"; }
305 else { chimera = "No"; }
307 }else{ chimera = "No"; }
309 outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n";
//flagged sequences are echoed to cout and their name is written to the accession file
310 if (chimera == "Yes") {
311 cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl;
312 string outAccString = querySeq->getName() + "\n";
314 MPI_Status statusAcc;
//copy exactly length bytes (no terminator) into a heap buffer for the MPI write
//NOTE(review): no delete[] for buf is visible in this listing - confirm it is released
315 int length = outAccString.length();
316 char* buf = new char[length];
317 memcpy(buf, outAccString.c_str(), length);
319 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
324 outputString += "Observed\t";
326 for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; }
327 outputString += "\n";
329 outputString += "Expected\t";
//the loop index m hides the outer m (used as m->... elsewhere) inside this loop only
331 for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; }
332 outputString += "\n";
//same copy-then-write pattern for the main output text
//NOTE(review): no delete[] for buf2 is visible in this listing - confirm it is released
335 int length = outputString.length();
336 char* buf2 = new char[length];
337 memcpy(buf2, outputString.c_str(), length);
339 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
344 catch(exception& e) {
345 m->errorOut(e, "Pintail", "print");
350 //***************************************************************************************************************
//Analyses one query sequence: finds its closest template sequence, then computes the observed and
//expected distances, DE, and the deviation (distance between query and that closest sequence).
//Results are stored in members; returns 0 early whenever m->control_pressed is set.
351 int Pintail::getChimeras(Sequence* query) {
355 windowSizes = window;
357 //find pairs has to be done before a mask
358 bestfit = findPairs(query);
360 if (m->control_pressed) { return 0; }
//NOTE(review): the condition guarding the masking is not visible; presumably it tests whether a mask was given
364 decalc->runMask(query);
365 decalc->runMask(bestfit);
368 if (filter) { //must be done after a mask
//trimSeqs receives trimmed; its first entry supplies the two positions used by findWindows and calcDist below
375 decalc->trimSeqs(query, bestfit, trimmed);
378 it = trimmed.begin();
379 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
381 //find observed distance
382 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
384 if (m->control_pressed) { return 0; }
//Qav is derived from the windows and the probability profile
386 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
388 if (m->control_pressed) { return 0; }
//coefficient computed from the observed distances and Qav
391 seqCoef = decalc->getCoef(obsDistance, Qav);
393 //calculating expected distance
394 expectedDistance = decalc->calcExpected(Qav, seqCoef);
396 if (m->control_pressed) { return 0; }
//DE is computed from the observed and expected distances
399 DE = decalc->calcDE(obsDistance, expectedDistance);
401 if (m->control_pressed) { return 0; }
403 //find distance between query and closest match
404 it = trimmed.begin();
405 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
411 catch(exception& e) {
412 m->errorOut(e, "Pintail", "getChimeras");
417 //***************************************************************************************************************
//Reads values from consfile and, for positions contained in decalc->getPos(), converts each value
//to a probability with (num - 0.25) / 0.75, clamped at 0.
//Two read paths are visible: one through MPI file calls and one through openInputFile.
//NOTE(review): the preprocessor lines separating the two paths are not visible in this listing.
419 vector<float> Pintail::readFreq() {
421 //read in probabilities and store in vector
425 set<int> h = decalc->getPos(); //positions of bases in masking sequence
433 //char* inFileName = new char[consfile.length()];
434 //memcpy(inFileName, consfile.c_str(), consfile.length());
//NOTE(review): strcpy is unchecked - a consfile path longer than 1023 characters would overflow this array
436 char inFileName[1024];
437 strcpy(inFileName, consfile.c_str());
439 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
440 MPI_File_get_size(inMPI, &size);
//NOTE(review): buffer holds exactly size bytes with no room for a terminator, yet the string below is
//built from it as a C string; no delete[] for buffer is visible either - confirm both
443 char* buffer = new char[size];
444 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
446 string tempBuf = buffer;
//drop anything picked up beyond the file's size
449 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
450 istringstream iss (tempBuf,istringstream::in);
//only positions present in h are converted
455 if (h.count(pos) > 0) {
//rescale so that 0.25 maps to 0 and 1 maps to 1
457 Pi = (num - 0.25) / 0.75;
459 //cannot have probability less than 0.
460 if (Pi < 0) { Pi = 0.0; }
462 //do you want this spot
469 MPI_File_close(&inMPI);
//stream-based path: same conversion as above, reading through openInputFile
474 openInputFile(consfile, in);
480 if (h.count(pos) > 0) {
482 Pi = (num - 0.25) / 0.75;
484 //cannot have probability less than 0.
485 if (Pi < 0) { Pi = 0.0; }
487 //do you want this spot
500 catch(exception& e) {
501 m->errorOut(e, "Pintail", "readFreq");
506 //***************************************************************************************************************
507 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
//The search itself is delegated to decalc->findClosest, called with q and templateSeqs.
508 Sequence* Pintail::findPairs(Sequence* q) {
511 Sequence* seqsMatches;
513 seqsMatches = decalc->findClosest(q, templateSeqs);
517 catch(exception& e) {
518 m->errorOut(e, "Pintail", "findPairs");
522 //**************************************************************************************************
//Computes quantilesMembers using several processes on the platforms named in the #if below.
//Each worker computes the quantiles for its templateLines range and writes them to "<pid>.temp";
//the parent then reads those files and appends the values to quantilesMembers.
//NOTE(review): the process-creation and wait calls are not visible in this listing.
523 void Pintail::createProcessesQuan() {
525 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
527 vector<int> processIDS;
529 //loop through and create all the processes you want
530 while (process != processors) {
//remember the process id so its output file can be found later
534 processIDS.push_back(pid);
//compute only the slice [start, end) assigned to this process index
538 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
540 //write out data to file so parent can read it
542 string s = toString(getpid()) + ".temp";
543 openOutputFile(s, out);
546 //output observed distances
//per entry: its size, then each value, tab-separated
547 for (int i = 0; i < quantilesMembers.size(); i++) {
548 out << quantilesMembers[i].size() << '\t';
549 for (int j = 0; j < quantilesMembers[i].size(); j++) {
550 out << quantilesMembers[i][j] << '\t';
//NOTE(review): a spawn failure ends the program with exit status 0 - confirm that is intended
558 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
561 //force parent to wait until all the processes are done
562 for (int i=0;i<processors;i++) {
563 int temp = processIDS[i];
567 //get data created by processes
568 for (int i=0;i<processors;i++) {
//file name matches the one written by the process with this id
570 string s = toString(processIDS[i]) + ".temp";
571 openInputFile(s, in);
573 vector< vector<float> > quan;
577 for (int h = 0; h < quan.size(); h++) {
583 vector<float> q; float w;
584 for (int j = 0; j < num; j++) {
594 //save quan in quantiles
595 for (int j = 0; j < quan.size(); j++) {
596 //put all values of q[i] into quan[i]
//appends element by element to the matching entry of quantilesMembers
597 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
598 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
//whole-template computation in a single call
//NOTE(review): presumably the branch used when the platform test above fails - the #else line is not visible
606 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
609 catch(exception& e) {
610 m->errorOut(e, "Pintail", "createProcessesQuan");
614 //***************************************************************************************************************
//Reads quanfile into a vector of rows. Each input record is a number followed by six values
//(read as ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine).
//Two read paths are visible: one through MPI file calls and one through openInputFile.
//NOTE(review): the preprocessor lines separating the two paths are not visible in this listing.
615 vector< vector<float> > Pintail::readQuantiles() {
618 float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
620 vector< vector<float> > quan;
621 vector <float> temp; temp.resize(6, 0);
//a row of six zeros is pushed before any file data is read
624 quan.push_back(temp);
632 //char* inFileName = new char[quanfile.length()];
633 //memcpy(inFileName, quanfile.c_str(), quanfile.length());
//NOTE(review): strcpy is unchecked - a quanfile path longer than 1023 characters would overflow this array
635 char inFileName[1024];
636 strcpy(inFileName, quanfile.c_str());
638 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
639 MPI_File_get_size(inMPI, &size);
//NOTE(review): buffer holds exactly size bytes with no room for a terminator, yet the string below is
//built from it as a C string; no delete[] for buffer is visible either - confirm both
643 char* buffer = new char[size];
644 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
646 string tempBuf = buffer;
//drop anything picked up beyond the file's size
647 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
648 istringstream iss (tempBuf,istringstream::in);
652 iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
//NOTE(review): the statements that reset temp and add ten are not visible in this listing
657 temp.push_back(twentyfive);
658 temp.push_back(fifty);
659 temp.push_back(seventyfive);
660 temp.push_back(ninetyfive);
661 temp.push_back(ninetynine);
663 quan.push_back(temp);
668 MPI_File_close(&inMPI);
//stream-based path: same record layout, reading through openInputFile
673 openInputFile(quanfile, in);
677 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
//NOTE(review): the statements that reset temp and add ten are not visible in this listing
682 temp.push_back(twentyfive);
683 temp.push_back(fifty);
684 temp.push_back(seventyfive);
685 temp.push_back(ninetyfive);
686 temp.push_back(ninetynine);
688 quan.push_back(temp);
698 catch(exception& e) {
699 m->errorOut(e, "Pintail", "readQuantiles");
703 //***************************************************************************************************************/
//Writes outputString to the file named by file.
//Two write paths are visible: one through MPI file calls and one through openOutputFile.
//NOTE(review): the preprocessor lines separating the two paths are not visible in this listing.
705 void Pintail::printQuanFile(string file, string outputString) {
714 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
//create the file if needed and open it write-only
716 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
718 //char* FileName = new char[file.length()];
719 //memcpy(FileName, file.c_str(), file.length());
//NOTE(review): the declaration of FileName is not visible; strcpy is unchecked, so confirm the buffer is large enough for file
722 strcpy(FileName, file.c_str());
725 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer
//copy exactly length bytes (no terminator) into a heap buffer for the MPI write
//NOTE(review): no delete[] for buf is visible in this listing - confirm it is released
727 int length = outputString.length();
728 char* buf = new char[length];
729 memcpy(buf, outputString.c_str(), length);
731 MPI_File_write(outQuan, buf, length, MPI_CHAR, &status);
734 MPI_File_close(&outQuan);
//stream-based path: open the file and write the whole string in one insertion
740 openOutputFile(file, outQuan);
742 outQuan << outputString;
747 catch(exception& e) {
748 m->errorOut(e, "Pintail", "printQuanFile");
753 //***************************************************************************************************************/