5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //comparator for ordering quanMember values: sorts lowest to highest by score
//returns true only when left.score is strictly less than right.score
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
//Pintail constructor; delegates to the Chimera() base constructor.
//In the statements visible here it records the template file name, loads the template
//sequences with readSeqs(temp), and allocates the two calculator objects used by the class.
21 Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() {
25 templateFileName = temp; templateSeqs = readSeqs(temp);
//heap-allocated helpers; distcalculator is deleted in ~Pintail
35 distcalculator = new eachGapDist();
36 decalc = new DeCalculator();
//any exception is handed to m->errorOut tagged with the class and function name
41 m->errorOut(e, "Pintail", "Pintail");
46 //***************************************************************************************************************
//release the eachGapDist object that the constructor allocated with new
51 delete distcalculator;
//any exception is handed to m->errorOut tagged as Pintail / ~Pintail
55 m->errorOut(e, "Pintail", "~Pintail");
59 //***************************************************************************************************************
//Prepares the state used when queries are checked. From the statements visible here it:
// - sizes windowSizesTemplate, quantiles and quantilesMembers and configures the DeCalculator mask
// - splits the template into per-process ranges (templateLines)
// - obtains probabilityProfile, either from decalc->calcFreq or from readFreq()
// - builds mergedFilterString with createFilter from the masked sequences in temp
// - either reads quantiles with readQuantiles() or computes them from the template and
//   writes them out with printQuanFile
//Returns 0 at the visible m->control_pressed check points.
60 int Pintail::doPrep() {
63 mergedFilterString = "";
//one entry per template sequence, each initialised to window
64 windowSizesTemplate.resize(templateSeqs.size(), window);
65 quantiles.resize(100); //one for every percent mismatch
66 quantilesMembers.resize(100); //one for every percent mismatch
68 //if the user does not enter a mask then you want to keep all the spots in the alignment
//NOTE(review): templateSeqs[0] is dereferenced with no emptiness check visible here - confirm the template cannot be empty
69 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
70 else { decalc->setAlignmentLength(seqMask.length()); }
72 decalc->setMask(seqMask);
77 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
78 //find breakup of templatefile for quantiles
//single process: one range covering every template sequence
79 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
//one linePair per process; boundaries are sqrt(i/processors) * templateSeqs.size(), so later
//ranges are generally narrower than earlier ones (presumably to balance pairwise work - TODO confirm)
81 for (int i = 0; i < processors; i++) {
82 templateLines.push_back(new linePair());
83 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
84 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
//single range over the whole template (presumably the alternative branch of the #if above - TODO confirm)
88 templateLines.push_back(new linePair(0, templateSeqs.size()));
92 m->mothurOut("Getting conservation... "); cout.flush();
94 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
//compute the profile from the template sequences
95 probabilityProfile = decalc->calcFreq(templateSeqs, templateFileName);
96 if (m->control_pressed) { return 0; }
97 m->mothurOut("Done."); m->mothurOutEndLine();
//otherwise load the profile through readFreq()
98 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
99 m->mothurOutEndLine();
//replace every profile value p with 1 - p
102 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //
105 //create filter if needed for later
108 //read in all query seqs
109 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
111 vector<Sequence*> temp;
112 //merge query seqs and template seqs
//temp holds the same pointers as tempQuerySeqs (no copies are made here)
114 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
//mask every sequence in temp before the filter is built
119 for (int i = 0; i < temp.size(); i++) {
120 if (m->control_pressed) {
//free the query sequences read above before bailing out
//NOTE(review): this inner index i shadows the enclosing loop's i
121 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
124 decalc->runMask(temp[i]);
128 mergedFilterString = createFilter(temp, 0.5);
130 if (m->control_pressed) {
131 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
135 //reread template seqs
//the query sequences read for filter creation are no longer needed
136 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
140 //quantiles are used to determine whether the de values found indicate a chimera
141 //if you have to calculate them, it's time intensive because you are finding the de and deviation values for each
142 //combination of sequences in the template
//a quantile file was supplied: load it instead of computing
143 if (quanfile != "") {
144 quantiles = readQuantiles();
146 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 decalc->runMask(templateSeqs[i]);
//apply runFilter to every template sequence
157 for (int i = 0; i < templateSeqs.size(); i++) {
158 if (m->control_pressed) { return 0; }
159 runFilter(templateSeqs[i]);
163 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
//single process computes over the whole template; otherwise the work is delegated to createProcessesQuan()
164 if (processors == 1) {
165 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
166 }else { createProcessesQuan(); }
168 if (m->control_pressed) { return 0; }
170 string noOutliers, outliers;
//choose the output file name from the filter / mask combination
172 if ((!filter) && (seqMask == "")) {
173 noOutliers = templateFileName + "pintail.quan";
174 }else if ((!filter) && (seqMask != "")) {
175 noOutliers =templateFileName + "pintail.masked.quan";
176 }else if ((filter) && (seqMask != "")) {
177 noOutliers = templateFileName + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "masked.quan";
178 }else if ((filter) && (seqMask == "")) {
179 noOutliers = templateFileName + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "quan";
182 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
184 if (m->control_pressed) { return 0; }
//text of the quantile file; the first line is "#" followed by the version string
186 string outputString = "#" + m->getVersion() + "\n";
//one output row per entry of quantilesMembers
189 for (int i = 0; i < quantilesMembers.size(); i++) {
192 if (quantilesMembers[i].size() == 0) {
193 //in case this is not a distance found in your template files
194 for (int g = 0; g < 6; g++) {
//sorted with the default operator<; compareQuanMembers is not passed here
199 sort(quantilesMembers[i].begin(), quantilesMembers[i].end());
//pick the values at 10%, 25%, 50%, 75%, 95% and 99% of the sorted row, in that order
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)]);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)]);
206 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)]);
208 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)]);
210 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)]);
212 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)]);
//row format: 1-based row number, then each value of temp, all tab separated
217 outputString += toString(i+1) + "\t";
218 for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; }
219 outputString += "\n";
225 printQuanFile(noOutliers, outputString);
228 quantilesMembers.clear();
230 m->mothurOut("Done."); m->mothurOutEndLine();
//discard the current template sequences and read them again from templateFileName
234 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
235 templateSeqs.clear();
236 templateSeqs = readSeqs(templateFileName);
//free the linePair objects allocated with new above
241 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
246 catch(exception& e) {
247 m->errorOut(e, "Pintail", "doPrep");
251 //***************************************************************************************************************
//Writes the result for the current querySeq to out: name, deviation ("div"), DE ("stDev") and
//the chimera flag, followed by the observed and expected distance values.
//When the flag is "Yes" the same summary is sent to m->mothurOut and the name is written to outAcc.
252 Sequence* Pintail::print(ostream& out, ostream& outAcc) {
//row of quantiles to consult: deviation rounded up
//NOTE(review): no bounds check on index is visible before quantiles[index] is read - confirm index < quantiles.size()
255 int index = ceil(deviation);
257 //is your DE value higher than the 95%
//column 4 is the fifth value stored per row (the 0.95 entry in doPrep's push order)
259 if (index != 0) { //if index is 0 then its an exact match to a template seq
//a stored 0.0 means no quantile value exists for this distance, so a message is reported instead of Yes/No
260 if (quantiles[index][4] == 0.0) {
261 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
263 if (DE > quantiles[index][4]) { chimera = "Yes"; }
264 else { chimera = "No"; }
266 }else{ chimera = "No"; }
268 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
269 if (chimera == "Yes") {
270 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
271 outAcc << querySeq->getName() << endl;
//observed distances, tab separated
275 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
//expected distances, tab separated
//NOTE(review): the loop variable m shadows the m used for m->mothurOut / m->errorOut inside this loop
280 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
286 catch(exception& e) {
287 m->errorOut(e, "Pintail", "print");
292 //***************************************************************************************************************
//MPI_File overload of print: builds the same report text as the ostream overload in outputString
//and writes it with MPI_File_write_shared. When the flag is "Yes" the summary goes to cout and
//the query name is written to outAcc.
293 Sequence* Pintail::print(MPI_File& out, MPI_File& outAcc) {
296 string outputString = "";
//row of quantiles to consult: deviation rounded up
//NOTE(review): no bounds check on index is visible before quantiles[index] is read - confirm index < quantiles.size()
297 int index = ceil(deviation);
299 //is your DE value higher than the 95%
301 if (index != 0) { //if index is 0 then its an exact match to a template seq
//a stored 0.0 means no quantile value exists for this distance
302 if (quantiles[index][4] == 0.0) {
303 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
305 if (DE > quantiles[index][4]) { chimera = "Yes"; }
306 else { chimera = "No"; }
308 }else{ chimera = "No"; }
310 outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n";
311 if (chimera == "Yes") {
312 cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl;
313 string outAccString = querySeq->getName() + "\n";
315 MPI_Status statusAcc;
//copy the text into a raw buffer of exactly length chars (no terminator) for the MPI write
//NOTE(review): no release of buf is visible in this listing - confirm it is freed with delete[]
316 int length = outAccString.length();
317 char* buf = new char[length];
318 memcpy(buf, outAccString.c_str(), length);
320 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
325 outputString += "Observed\t";
327 for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; }
328 outputString += "\n";
330 outputString += "Expected\t";
//NOTE(review): the loop variable m shadows the m used for m->errorOut inside this loop
332 for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; }
333 outputString += "\n";
//same raw-buffer copy for the full report
//NOTE(review): no release of buf2 is visible in this listing - confirm it is freed with delete[]
336 int length = outputString.length();
337 char* buf2 = new char[length];
338 memcpy(buf2, outputString.c_str(), length);
340 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
345 catch(exception& e) {
346 m->errorOut(e, "Pintail", "print");
351 //***************************************************************************************************************
//Computes the values that print() reports for one query: finds the closest template sequence
//(bestfit), masks and trims the pair, then fills obsDistance, Qav, seqCoef, expectedDistance,
//DE and deviation through the DeCalculator. Returns 0 at each visible m->control_pressed check.
352 int Pintail::getChimeras(Sequence* query) {
356 windowSizes = window;
358 //find pairs has to be done before a mask
359 bestfit = findPairs(query);
361 if (m->control_pressed) { return 0; }
//apply the mask to both the query and its best match
365 decalc->runMask(query);
366 decalc->runMask(bestfit);
369 if (filter) { //must be done after a mask
//trimSeqs is given trimmed; its first entry is read below as a (first, second) pair
376 decalc->trimSeqs(query, bestfit, trimmed);
379 it = trimmed.begin();
380 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
382 //find observed distance
383 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
385 if (m->control_pressed) { return 0; }
//Qav is derived from the windows and the probability profile
387 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
389 if (m->control_pressed) { return 0; }
//coefficient computed from the observed distances and Qav; used for the expected distances below
392 seqCoef = decalc->getCoef(obsDistance, Qav);
394 //calculating expected distance
395 expectedDistance = decalc->calcExpected(Qav, seqCoef);
397 if (m->control_pressed) { return 0; }
//DE is computed from the observed and expected distances
400 DE = decalc->calcDE(obsDistance, expectedDistance);
402 if (m->control_pressed) { return 0; }
404 //find distance between query and closest match
405 it = trimmed.begin();
406 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
412 catch(exception& e) {
413 m->errorOut(e, "Pintail", "getChimeras");
418 //***************************************************************************************************************
//Reads conservation values from consfile and returns them as a vector<float>.
//Two code paths are visible: one reads the whole file through MPI into a string stream, the
//other opens it with m->openInputFile. In both, a value num at a position contained in the
//mask position set h is rescaled to Pi = (num - 0.25) / 0.75 and clamped at 0.
420 vector<float> Pintail::readFreq() {
422 //read in probabilities and store in vector
426 set<int> h = decalc->getPos(); //positions of bases in masking sequence
434 //char* inFileName = new char[consfile.length()];
435 //memcpy(inFileName, consfile.c_str(), consfile.length());
//NOTE(review): strcpy into a fixed 1024-byte buffer overruns it if consfile is longer than 1023 characters
437 char inFileName[1024];
438 strcpy(inFileName, consfile.c_str());
440 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
441 MPI_File_get_size(inMPI, &size);
//read the entire file into a buffer of exactly size chars
//NOTE(review): constructing a string from buffer requires a NUL terminator, and none is written in the
//lines visible here - confirm, or construct with an explicit length. No release of buffer is visible either.
444 char* buffer = new char[size];
445 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
447 string tempBuf = buffer;
//cut off anything picked up beyond the file contents
450 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
451 istringstream iss (tempBuf,istringstream::in);
//read the first line separately from the values (presumably a header line - TODO confirm)
454 string line = m->getline(iss); m->gobble(iss);
//only positions present in the mask position set are processed here
459 if (h.count(pos) > 0) {
461 Pi = (num - 0.25) / 0.75;
463 //cannot have probability less than 0.
464 if (Pi < 0) { Pi = 0.0; }
466 //do you want this spot
473 MPI_File_close(&inMPI);
//non-MPI path: same parsing, reading from an input stream opened on consfile
478 m->openInputFile(consfile, in);
481 string line = m->getline(in); m->gobble(in);
487 if (h.count(pos) > 0) {
489 Pi = (num - 0.25) / 0.75;
491 //cannot have probability less than 0.
492 if (Pi < 0) { Pi = 0.0; }
494 //do you want this spot
507 catch(exception& e) {
508 m->errorOut(e, "Pintail", "readFreq");
513 //***************************************************************************************************************
514 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
//Thin wrapper: asks the DeCalculator for the template sequence closest to q.
//The pointer comes from decalc->findClosest; whether it is a new object or an element of
//templateSeqs cannot be told from here.
515 Sequence* Pintail::findPairs(Sequence* q) {
518 Sequence* seqsMatches;
520 seqsMatches = decalc->findClosest(q, templateSeqs);
524 catch(exception& e) {
525 m->errorOut(e, "Pintail", "findPairs");
529 //**************************************************************************************************
//Computes quantilesMembers using several processes on the platforms named in the #if below.
//Each spawned process computes getQuantiles over its own templateLines range and writes the
//result to "<pid>.temp"; the parent computes range 0, waits for the others, then appends the
//contents of every temp file to quantilesMembers. A single whole-template getQuantiles call is
//also visible near the end (presumably the branch for other platforms - TODO confirm).
530 void Pintail::createProcessesQuan() {
532 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//ids of the spawned processes, used later to wait for them and to locate their temp files
534 vector<int> processIDS;
536 //loop through and create all the processes you want
537 while (process != processors) {
541 processIDS.push_back(pid);
//this process's share of the template: the range stored in templateLines[process]
545 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
547 //write out data to file so parent can read it
//the file is named after this process's own pid
549 string s = toString(getpid()) + ".temp";
550 m->openOutputFile(s, out);
552 //output observed distances
//each row is written as its size followed by its values, tab separated
553 for (int i = 0; i < quantilesMembers.size(); i++) {
554 out << quantilesMembers[i].size() << '\t';
555 for (int j = 0; j < quantilesMembers[i].size(); j++) {
556 out << quantilesMembers[i][j] << '\t';
//on failure, report it and send SIGINT to every process recorded so far
565 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
566 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
571 //parent does its part
572 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[0]->start, templateLines[0]->end);
574 //force parent to wait until all the processes are done
575 for (int i=0;i<(processors-1);i++) {
576 int temp = processIDS[i];
580 //get data created by processes
581 for (int i=0;i<(processors-1);i++) {
//temp file written by the process with this id
583 string s = toString(processIDS[i]) + ".temp";
584 m->openInputFile(s, in);
//rows read back from this temp file
586 vector< vector<float> > quan;
590 for (int h = 0; h < quan.size(); h++) {
596 vector<float> q; float w;
597 for (int j = 0; j < num; j++) {
607 //save quan in quantiles
608 for (int j = 0; j < quan.size(); j++) {
609 //put all values of q[i] into quan[i]
//append this file's row j to the parent's row j
610 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
611 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
//single call covering the whole template
619 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
622 catch(exception& e) {
623 m->errorOut(e, "Pintail", "createProcessesQuan");
627 //***************************************************************************************************************
//Reads quanfile and returns one row of quantile values per record in a vector< vector<float> >.
//Each record is parsed as: num, then the 10, 25, 50, 75, 95 and 99 percent values.
//Two code paths are visible: an MPI read of the whole file, and a stream opened with m->openInputFile.
628 vector< vector<float> > Pintail::readQuantiles() {
631 float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
633 vector< vector<float> > quan;
//a row of six zeros is stored first, ahead of the rows read from the file
634 vector <float> temp; temp.resize(6, 0);
637 quan.push_back(temp);
645 //char* inFileName = new char[quanfile.length()];
646 //memcpy(inFileName, quanfile.c_str(), quanfile.length());
//NOTE(review): strcpy into a fixed 1024-byte buffer overruns it if quanfile is longer than 1023 characters
648 char inFileName[1024];
649 strcpy(inFileName, quanfile.c_str());
651 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
652 MPI_File_get_size(inMPI, &size);
//read the entire file into a buffer of exactly size chars
//NOTE(review): constructing a string from buffer requires a NUL terminator, and none is written in the
//lines visible here - confirm, or construct with an explicit length. No release of buffer is visible either.
656 char* buffer = new char[size];
657 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
659 string tempBuf = buffer;
660 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
661 istringstream iss (tempBuf,istringstream::in);
//read the first line separately from the records (doPrep writes "#" + version as the first line of the files it creates)
665 string line = m->getline(iss); m->gobble(iss);
668 iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
//NOTE(review): the statements that reset temp and store ten are not visible in this listing - confirm
673 temp.push_back(twentyfive);
674 temp.push_back(fifty);
675 temp.push_back(seventyfive);
676 temp.push_back(ninetyfive);
677 temp.push_back(ninetynine);
679 quan.push_back(temp);
684 MPI_File_close(&inMPI);
//non-MPI path: same record layout, read from an input stream opened on quanfile
689 m->openInputFile(quanfile, in);
692 string line = m->getline(in); m->gobble(in);
696 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
701 temp.push_back(twentyfive);
702 temp.push_back(fifty);
703 temp.push_back(seventyfive);
704 temp.push_back(ninetyfive);
705 temp.push_back(ninetynine);
707 quan.push_back(temp);
717 catch(exception& e) {
718 m->errorOut(e, "Pintail", "readQuantiles");
722 //***************************************************************************************************************/
//Writes outputString to the file named by file.
//Two code paths are visible: an MPI path (open with MPI_COMM_SELF, MPI_File_write, close) and a
//stream path that opens the file with m->openOutputFile and inserts the string.
724 void Pintail::printQuanFile(string file, string outputString) {
733 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
//create the file if needed and open it write-only
735 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
737 //char* FileName = new char[file.length()];
738 //memcpy(FileName, file.c_str(), file.length());
//NOTE(review): the declaration of FileName is not visible here; strcpy is unbounded, so confirm the
//destination is large enough for file plus its terminator
741 strcpy(FileName, file.c_str());
744 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer
//copy the text into a raw buffer of exactly length chars for the MPI write
//NOTE(review): no release of buf is visible in this listing - confirm it is freed with delete[]
746 int length = outputString.length();
747 char* buf = new char[length];
748 memcpy(buf, outputString.c_str(), length);
750 MPI_File_write(outQuan, buf, length, MPI_CHAR, &status);
753 MPI_File_close(&outQuan);
//non-MPI path
759 m->openOutputFile(file, outQuan);
761 outQuan << outputString;
766 catch(exception& e) {
767 m->errorOut(e, "Pintail", "printQuanFile");
772 //***************************************************************************************************************/