5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
// Ordering predicate for quanMember values: returns true when left.score is
// strictly less than right.score. doPrep() passes it to sort() so that each
// quantile bucket is arranged from lowest to highest score.
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
// Constructor. Only part of the body is visible in this view; the visible
// statements record the template file name, load the template sequences with
// readSeqs(), and allocate the two helper objects used by the rest of the class
// (distcalculator as an eachGapDist, decalc as a DeCalculator).
// NOTE(review): what the remaining parameters (filename, f, p, mask, cons, q,
// win, inc, o) are stored into is not visible here — confirm against the header.
21 Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() {
25 templateFileName = temp; templateSeqs = readSeqs(temp);
// Raw owning pointers created with new; distcalculator is released by the
// `delete distcalculator;` statement further down the file.
35 distcalculator = new eachGapDist();
36 decalc = new DeCalculator();
// Reports an exception tagged with the class and function name; presumably
// inside a catch handler whose opening line is not visible here.
41 m->errorOut(e, "Pintail", "Pintail");
46 //***************************************************************************************************************
// Fragment of what appears to be the destructor (the errorOut call below is
// tagged "~Pintail"); its signature line is not visible in this view.
// Releases the distance calculator that the constructor allocated with new.
// NOTE(review): no matching `delete decalc;` is visible here — confirm one
// exists, otherwise the DeCalculator allocated in the constructor leaks.
51 delete distcalculator;
55 m->errorOut(e, "Pintail", "~Pintail");
59 //***************************************************************************************************************
// Prepares the state that getChimeras() and print() read later:
//   1. sizes the quantile tables and configures the mask on decalc,
//   2. splits the template sequences into per-process index ranges,
//   3. obtains the conservation profile (calculated or read from file),
//   4. builds a filter string from the masked sequences,
//   5. either reads the quantiles from quanfile or calculates them from the
//      template and writes them out through printQuanFile().
// Every visible early exit taken when m->control_pressed is set returns 0.
// NOTE(review): a number of lines of this function are not visible in this
// view, so the comments below describe only what the visible statements do.
60 int Pintail::doPrep() {
63 mergedFilterString = "";
// One window size per template sequence, all initialised to `window`.
64 windowSizesTemplate.resize(templateSeqs.size(), window);
65 quantiles.resize(100); //one for every percent mismatch
66 quantilesMembers.resize(100); //one for every percent mismatch
68 //if the user does not enter a mask then you want to keep all the spots in the alignment
// NOTE(review): templateSeqs[0] is dereferenced without a visible emptiness
// check — an empty template file would be undefined behaviour here; confirm.
69 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
70 else { decalc->setAlignmentLength(seqMask.length()); }
72 decalc->setMask(seqMask);
77 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
78 //find breakup of templatefile for quantiles
79 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
// Multi-process split: process i gets [sqrt(i/p) * N, sqrt((i+1)/p) * N), where
// p = processors and N = templateSeqs.size(). The square root makes the
// earlier ranges wider than the later ones.
// NOTE(review): presumably this evens out work that grows with the start
// index inside getQuantiles — confirm against DeCalculator.
81 for (int i = 0; i < processors; i++) {
82 templateLines.push_back(new linePair());
83 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
84 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
// Single range covering the whole template; appears to be the alternative
// branch of the #if above (the #else line is not visible here).
88 templateLines.push_back(new linePair(0, templateSeqs.size()));
92 m->mothurOut("Getting conservation... "); cout.flush();
// Conservation profile is calculated from the template via calcFreq() in one
// branch and read with readFreq() in the other; the condition selecting
// between them is not visible in this view.
94 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
95 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
96 if (m->control_pressed) { return 0; }
97 m->mothurOut("Done."); m->mothurOutEndLine();
98 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
99 m->mothurOutEndLine();
// Replace every stored value p with 1 - p.
102 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //
105 //create filter if needed for later
108 //read in all query seqs
109 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
111 vector<Sequence*> temp;
112 //merge query seqs and template seqs
// Only the query half of the merge is visible here; temp holds non-owning
// copies of the pointers.
114 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
// Mask every merged sequence before the filter is built from them.
119 for (int i = 0; i < temp.size(); i++) {
120 if (m->control_pressed) {
// NOTE(review): this inner `i` shadows the outer loop index; that is harmless
// only if the function returns right after the cleanup (not visible here).
121 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
124 decalc->runMask(temp[i]);
// The meaning of the 0.5 argument is defined by createFilter, which is not
// visible in this view.
128 mergedFilterString = createFilter(temp, 0.5);
130 if (m->control_pressed) {
131 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
135 //reread template seqs
// The query sequences were only needed to build the filter; free them.
136 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
140 //quantiles are used to determine whether the de values found indicate a chimera
141 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
142 //combination of sequences in the template
143 if (quanfile != "") {
144 quantiles = readQuantiles();
146 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
149 for (int i = 0; i < templateSeqs.size(); i++) {
150 if (m->control_pressed) { return 0; }
151 decalc->runMask(templateSeqs[i]);
// Apply the filter to every template sequence (the guarding condition is not
// visible here).
157 for (int i = 0; i < templateSeqs.size(); i++) {
158 if (m->control_pressed) { return 0; }
159 runFilter(templateSeqs[i]);
163 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
// Single process: compute over the whole template in place; otherwise hand
// off to createProcessesQuan(), which fills quantilesMembers.
164 if (processors == 1) {
165 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
166 }else { createProcessesQuan(); }
168 if (m->control_pressed) { return 0; }
// `outliers` is declared but not assigned in any visible line.
170 string noOutliers, outliers;
// Output file name depends on the filter/mask combination; the filtered
// variants also embed the query file's root name.
172 if ((!filter) && (seqMask == "")) {
173 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
174 }else if ((!filter) && (seqMask != "")) {
175 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
176 }else if ((filter) && (seqMask != "")) {
177 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
178 }else if ((filter) && (seqMask == "")) {
179 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
182 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
184 if (m->control_pressed) { return 0; }
186 string outputString = "";
// For each distance bucket build one row of six values. An empty bucket takes
// the six-iteration loop below (its body is not visible); otherwise the bucket
// is sorted ascending and sampled at the 10/25/50/75/95/99 percent positions.
// int(size * fraction) is always < size for a non-empty bucket, so the
// subscripts stay in range.
189 for (int i = 0; i < quantilesMembers.size(); i++) {
192 if (quantilesMembers[i].size() == 0) {
193 //in case this is not a distance found in your template files
194 for (int g = 0; g < 6; g++) {
199 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
202 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
204 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
206 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
208 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
210 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
212 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
// Rows are labelled starting at 1 (i+1), tab separated, one per line.
217 outputString += toString(i+1) + "\t";
218 for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; }
219 outputString += "\n";
225 printQuanFile(noOutliers, outputString);
227 m->mothurOut("Done."); m->mothurOutEndLine();
// Discard the current template sequences and load them again from the file.
// NOTE(review): presumably because runMask/runFilter changed them in place —
// confirm.
231 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
232 templateSeqs.clear();
233 templateSeqs = readSeqs(templateFileName);
// The per-process ranges are no longer needed.
238 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
243 catch(exception& e) {
244 m->errorOut(e, "Pintail", "doPrep");
248 //***************************************************************************************************************
// Stream version of the result writer for the current query (querySeq).
// Writes one summary line plus the observed and expected distance values to
// `out`; when the query is flagged "Yes" it also logs the summary through
// m->mothurOut and writes the query name to `outAcc`.
// The flag compares DE with quantiles[index][4], where index = ceil(deviation);
// slot 4 is the 95% value in the order doPrep() builds each row
// (10, 25, 50, 75, 95, 99).
249 int Pintail::print(ostream& out, ostream& outAcc) {
// NOTE(review): index is used to subscript quantiles below without a visible
// bounds check — confirm deviation can never exceed the table size.
251 int index = ceil(deviation);
253 //is your DE value higher than the 95%
255 if (index != 0) { //if index is 0 then its an exact match to a template seq
// A stored value of exactly 0.0 is treated as "no quantile data for this
// distance"; the message, rather than Yes/No, then becomes the flag text.
256 if (quantiles[index][4] == 0.0) {
257 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
259 if (DE > quantiles[index][4]) { chimera = "Yes"; }
260 else { chimera = "No"; }
262 }else{ chimera = "No"; }
264 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
265 if (chimera == "Yes") {
266 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
267 outAcc << querySeq->getName() << endl;
271 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
// NOTE(review): the loop index `m` shadows the `m` used for m->errorOut /
// m->mothurOut in this function; the shadowing is limited to this loop.
276 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
282 catch(exception& e) {
283 m->errorOut(e, "Pintail", "print");
288 //***************************************************************************************************************
// MPI version of the result writer for the current query (querySeq).
// Builds the same summary / Observed / Expected text as the stream overload in
// a string and writes it to `out` with MPI_File_write_shared; when the query
// is flagged "Yes" the summary is echoed to cout and the query name is written
// to `outAcc`.
// The flag compares DE with quantiles[index][4], where index = ceil(deviation).
289 int Pintail::print(MPI_File& out, MPI_File& outAcc) {
// `results` is not read or written in any other visible line of this function.
291 bool results = false;
292 string outputString = "";
// NOTE(review): index is used to subscript quantiles below without a visible
// bounds check — confirm deviation can never exceed the table size.
293 int index = ceil(deviation);
295 //is your DE value higher than the 95%
297 if (index != 0) { //if index is 0 then its an exact match to a template seq
298 if (quantiles[index][4] == 0.0) {
299 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
301 if (DE > quantiles[index][4]) { chimera = "Yes"; }
302 else { chimera = "No"; }
304 }else{ chimera = "No"; }
306 outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n";
307 if (chimera == "Yes") {
// Written straight to cout here, whereas the stream overload of print() logs
// through m->mothurOut.
308 cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl;
309 string outAccString = querySeq->getName() + "\n";
311 MPI_Status statusAcc;
// Copy the text into a heap buffer to pass it to MPI.
// NOTE(review): no delete[] for buf is visible in this view — confirm it is
// released, otherwise every flagged query leaks `length` bytes.
312 int length = outAccString.length();
313 char* buf = new char[length];
314 memcpy(buf, outAccString.c_str(), length);
316 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
321 outputString += "Observed\t";
323 for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; }
324 outputString += "\n";
326 outputString += "Expected\t";
// NOTE(review): the loop index `m` shadows the `m` used for m->errorOut in
// this function; the shadowing is limited to this loop.
328 for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; }
329 outputString += "\n";
// Same copy-to-buffer pattern for the main output.
// NOTE(review): no delete[] for buf2 is visible, and the declaration of
// `status` is not visible in this view.
332 int length = outputString.length();
333 char* buf2 = new char[length];
334 memcpy(buf2, outputString.c_str(), length);
336 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
341 catch(exception& e) {
342 m->errorOut(e, "Pintail", "print");
347 //***************************************************************************************************************
// Computes the per-query values that print() later reports: obsDistance,
// expectedDistance, DE and deviation. It finds the closest template sequence
// for `query`, masks and trims both sequences, derives the windows, and runs
// the DeCalculator steps in order.
// Each visible cancellation check returns 0 when m->control_pressed is set.
// NOTE(review): the declarations of `trimmed` and `it`, and the body of the
// filter branch, are not visible in this view.
348 int Pintail::getChimeras(Sequence* query) {
352 windowSizes = window;
354 //find pairs has to be done before a mask
355 bestfit = findPairs(query);
357 if (m->control_pressed) { return 0; }
361 decalc->runMask(query);
362 decalc->runMask(bestfit);
365 if (filter) { //must be done after a mask
372 decalc->trimSeqs(query, bestfit, trimmed);
// The first entry of `trimmed` supplies the two position values
// (it->first, it->second) handed to findWindows and, later, to calcDist.
375 it = trimmed.begin();
376 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
378 //find observed distance
379 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
381 if (m->control_pressed) { return 0; }
// Qav is computed from the windows and the conservation profile; it feeds
// both the coefficient and the expected distances below.
383 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
385 if (m->control_pressed) { return 0; }
388 seqCoef = decalc->getCoef(obsDistance, Qav);
390 //calculating expected distance
391 expectedDistance = decalc->calcExpected(Qav, seqCoef);
393 if (m->control_pressed) { return 0; }
// DE is calculated from the observed and expected distance vectors.
396 DE = decalc->calcDE(obsDistance, expectedDistance);
398 if (m->control_pressed) { return 0; }
400 //find distance between query and closest match
401 it = trimmed.begin();
402 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
408 catch(exception& e) {
409 m->errorOut(e, "Pintail", "getChimeras");
414 //***************************************************************************************************************
// Reads the conservation values from consfile and returns them as a
// vector<float>. Two reading paths are visible: one that loads the whole file
// through MPI into a string stream, and one that uses openInputFile().
// NOTE(review): presumably the two paths are selected by a preprocessor
// switch — the directive lines are not visible in this view.
// In both paths a value is considered only when its position is in the set
// returned by decalc->getPos(), and it is rescaled with (num - 0.25) / 0.75,
// clamped below at 0.
416 vector<float> Pintail::readFreq() {
418 //read in probabilities and store in vector
422 set<int> h = decalc->getPos(); //positions of bases in masking sequence
430 //char* inFileName = new char[consfile.length()];
431 //memcpy(inFileName, consfile.c_str(), consfile.length());
// NOTE(review): strcpy into a fixed 1024-byte array with no visible length
// check — a consfile path of 1024 characters or more would overflow it.
433 char inFileName[1024];
434 strcpy(inFileName, consfile.c_str());
436 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
437 MPI_File_get_size(inMPI, &size);
// NOTE(review): buffer holds exactly `size` bytes and no terminating '\0' is
// written in any visible line, yet `string tempBuf = buffer` reads until a
// '\0' is found. That can read past the allocation; the substr() below only
// trims the result afterwards. string(buffer, size) would avoid the over-read.
// No delete[] for buffer is visible either — confirm.
440 char* buffer = new char[size];
441 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
443 string tempBuf = buffer;
446 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
447 istringstream iss (tempBuf,istringstream::in);
// Only positions present in the mask position set are considered.
452 if (h.count(pos) > 0) {
// Linear rescale: an input of 0.25 maps to 0 and an input of 1.0 maps to 1.
454 Pi = (num - 0.25) / 0.75;
456 //cannot have probability less than 0.
457 if (Pi < 0) { Pi = 0.0; }
459 //do you want this spot
466 MPI_File_close(&inMPI);
// Second reading path: same per-position handling, reading from the stream
// opened by openInputFile().
471 openInputFile(consfile, in);
477 if (h.count(pos) > 0) {
479 Pi = (num - 0.25) / 0.75;
481 //cannot have probability less than 0.
482 if (Pi < 0) { Pi = 0.0; }
484 //do you want this spot
497 catch(exception& e) {
498 m->errorOut(e, "Pintail", "readFreq");
503 //***************************************************************************************************************
504 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
// Asks decalc->findClosest() for the template sequence closest to `q` among
// templateSeqs and stores the result in seqsMatches.
// NOTE(review): the return statement is not visible in this view; presumably
// seqsMatches is what gets returned. Whether the returned pointer aliases an
// element of templateSeqs or is a new object is decided inside findClosest —
// confirm before deleting it.
505 Sequence* Pintail::findPairs(Sequence* q) {
508 Sequence* seqsMatches;
510 seqsMatches = decalc->findClosest(q, templateSeqs);
514 catch(exception& e) {
515 m->errorOut(e, "Pintail", "findPairs");
519 //**************************************************************************************************
// Multi-process quantile calculation. On the platforms named in the #if below,
// it starts `processors` workers; each worker runs decalc->getQuantiles() on
// its templateLines[process] range and writes the result to "<pid>.temp".
// The parent waits for every worker, reads the files back, and appends their
// contents to quantilesMembers. The last visible getQuantiles() call computes
// everything in this process instead.
// NOTE(review): the process-creation call itself (presumably fork()), the wait
// call, and any removal of the .temp files are not visible in this view.
520 void Pintail::createProcessesQuan() {
522 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
524 vector<int> processIDS;
526 //loop through and create all the processes you want
527 while (process != processors) {
// Parent side: remember the worker's pid so its output file can be found.
531 processIDS.push_back(pid);
// Worker side: compute only this worker's slice of the template.
535 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
537 //write out data to file so parent can read it
539 string s = toString(getpid()) + ".temp";
540 openOutputFile(s, out);
543 //output observed distances
// Per bucket: the member count, then score / member1 / member2 for each
// member, all tab separated.
544 for (int i = 0; i < quantilesMembers.size(); i++) {
545 out << quantilesMembers[i].size() << '\t';
546 for (int j = 0; j < quantilesMembers[i].size(); j++) {
547 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
// NOTE(review): exit(0) reports a successful exit status even though spawning
// failed — confirm whether a non-zero status is wanted.
555 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
558 //force parent to wait until all the processes are done
559 for (int i=0;i<processors;i++) {
560 int temp = processIDS[i];
564 //get data created by processes
565 for (int i=0;i<processors;i++) {
567 string s = toString(processIDS[i]) + ".temp";
568 openInputFile(s, in);
570 vector< vector<quanMember> > quan;
// NOTE(review): the loop index `m` shadows the `m` used for m->mothurOut /
// m->errorOut in this function; the shadowing is limited to this loop.
574 for (int m = 0; m < quan.size(); m++) {
// Rebuild each bucket from the values written by the worker.
580 vector<quanMember> q; float w; int b, n;
581 for (int j = 0; j < num; j++) {
584 quanMember newMember(w, b, n);
585 q.push_back(newMember);
593 //save quan in quantiles
594 for (int j = 0; j < quan.size(); j++) {
595 //put all values of q[i] into quan[i]
596 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
597 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
// Whole-template calculation in the current process; appears to be the
// alternative branch of the #if above (the #else line is not visible here).
605 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
608 catch(exception& e) {
609 m->errorOut(e, "Pintail", "createProcessesQuan");
613 //***************************************************************************************************************
// Reads the quantile table from quanfile and returns it as rows of floats.
// The table starts with one all-zero row of six values, pushed before any file
// data is read; each record read afterwards is a leading number followed by
// six quantile values (ten, twentyfive, fifty, seventyfive, ninetyfive,
// ninetynine). Two reading paths are visible: one through MPI into a string
// stream, and one through openInputFile().
// NOTE(review): presumably the two paths are selected by a preprocessor
// switch, and presumably `temp` is cleared and `ten` pushed on lines that are
// not visible in this view — confirm.
614 vector< vector<float> > Pintail::readQuantiles() {
617 float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
619 vector< vector<float> > quan;
// Row 0 placeholder: six zeros. doPrep() labels the rows it writes from 1.
620 vector <float> temp; temp.resize(6, 0);
623 quan.push_back(temp);
631 //char* inFileName = new char[quanfile.length()];
632 //memcpy(inFileName, quanfile.c_str(), quanfile.length());
// NOTE(review): strcpy into a fixed 1024-byte array with no visible length
// check — a quanfile path of 1024 characters or more would overflow it.
634 char inFileName[1024];
635 strcpy(inFileName, quanfile.c_str());
637 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
638 MPI_File_get_size(inMPI, &size);
// NOTE(review): buffer holds exactly `size` bytes and no terminating '\0' is
// written in any visible line, yet `string tempBuf = buffer` reads until a
// '\0' is found. That can read past the allocation; the substr() below only
// trims the result afterwards. string(buffer, size) would avoid the over-read.
// No delete[] for buffer is visible either — confirm.
642 char* buffer = new char[size];
643 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
645 string tempBuf = buffer;
646 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
647 istringstream iss (tempBuf,istringstream::in);
// One record: leading number, then the six quantile values.
651 iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
656 temp.push_back(twentyfive);
657 temp.push_back(fifty);
658 temp.push_back(seventyfive);
659 temp.push_back(ninetyfive);
660 temp.push_back(ninetynine);
662 quan.push_back(temp);
667 MPI_File_close(&inMPI);
// Second reading path: same record layout, read from the stream opened by
// openInputFile().
672 openInputFile(quanfile, in);
676 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
681 temp.push_back(twentyfive);
682 temp.push_back(fifty);
683 temp.push_back(seventyfive);
684 temp.push_back(ninetyfive);
685 temp.push_back(ninetynine);
687 quan.push_back(temp);
697 catch(exception& e) {
698 m->errorOut(e, "Pintail", "readQuantiles");
702 //***************************************************************************************************************/
// Writes `outputString` to the file named `file`. Two writing paths are
// visible: one that opens the file through MPI (create + write-only, on
// MPI_COMM_SELF) and writes the text with MPI_File_write, and one that opens
// it with openOutputFile() and streams the text.
// NOTE(review): presumably the two paths are selected by a preprocessor
// switch — the directive lines are not visible in this view.
704 void Pintail::printQuanFile(string file, string outputString) {
// The rank is fetched into pid; what is done with it is not visible here.
713 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
715 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
717 //char* FileName = new char[file.length()];
718 //memcpy(FileName, file.c_str(), file.length());
// NOTE(review): the declaration of FileName is not visible in this view; if it
// is a fixed-size array like the ones in readFreq()/readQuantiles(), this
// strcpy has no length check — confirm.
721 strcpy(FileName, file.c_str());
724 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer
// Copy the text into a heap buffer to pass it to MPI.
// NOTE(review): no delete[] for buf is visible in this view — confirm it is
// released.
726 int length = outputString.length();
727 char* buf = new char[length];
728 memcpy(buf, outputString.c_str(), length);
730 MPI_File_write(outQuan, buf, length, MPI_CHAR, &status);
733 MPI_File_close(&outQuan);
// Stream path.
739 openOutputFile(file, outQuan);
741 outQuan << outputString;
746 catch(exception& e) {
747 m->errorOut(e, "Pintail", "printQuanFile");
752 //***************************************************************************************************************/