5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
21 Pintail::Pintail(string filename, string o) {
22 fastafile = filename; outputDir = o;
23 distcalculator = new eachGapDist();
24 decalc = new DeCalculator();
26 //***************************************************************************************************************
31 delete distcalculator;
35 errorOut(e, "Pintail", "~Pintail");
39 //***************************************************************************************************************
40 void Pintail::doPrep() {
43 mergedFilterString = "";
44 windowSizesTemplate.resize(templateSeqs.size(), window);
45 quantiles.resize(100); //one for every percent mismatch
46 quantilesMembers.resize(100); //one for every percent mismatch
48 //if the user does not enter a mask then you want to keep all the spots in the alignment
49 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
50 else { decalc->setAlignmentLength(seqMask.length()); }
52 decalc->setMask(seqMask);
54 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
55 //find breakup of templatefile for quantiles
56 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
58 for (int i = 0; i < processors; i++) {
59 templateLines.push_back(new linePair());
60 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
61 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
65 templateLines.push_back(new linePair(0, templateSeqs.size()));
69 mothurOut("Getting conservation... "); cout.flush();
71 mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
72 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
73 mothurOut("Done."); mothurOutEndLine();
74 }else { probabilityProfile = readFreq(); mothurOut("Done."); }
78 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
81 //create filter if needed for later
84 cout << "filter" << endl;
86 //read in all query seqs
88 openInputFile(fastafile, in);
90 vector<Sequence*> tempQuerySeqs;
92 Sequence* s = new Sequence(in);
95 if (s->getName() != "") { tempQuerySeqs.push_back(s); }
99 vector<Sequence*> temp;
100 //merge query seqs and template seqs
102 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
103 cout << temp.size() << endl;
106 cout << "masked " << seqMask << endl;
108 for (int i = 0; i < temp.size(); i++) {
109 decalc->runMask(temp[i]);
113 mergedFilterString = createFilter(temp, 0.5);
114 cout << "here" << endl;
115 //reread template seqs
116 //for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
120 //quantiles are used to determine whether the de values found indicate a chimera
121 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
122 //combination of sequences in the template
123 if (quanfile != "") {
124 quantiles = readQuantiles();
126 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
129 for (int i = 0; i < templateSeqs.size(); i++) {
130 decalc->runMask(templateSeqs[i]);
136 for (int i = 0; i < templateSeqs.size(); i++) {
137 runFilter(templateSeqs[i]);
141 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
142 if (processors == 1) {
143 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
144 }else { createProcessesQuan(); }
148 string noOutliers, outliers;
150 if ((!filter) && (seqMask == "")) {
151 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
152 }else if ((!filter) && (seqMask != "")) {
153 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
154 }else if ((filter) && (seqMask != "")) {
155 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
156 }else if ((filter) && (seqMask == "")) {
157 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
163 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
165 openOutputFile(noOutliers, out5);
167 for (int i = 0; i < quantilesMembers.size(); i++) {
170 if (quantilesMembers[i].size() == 0) {
171 //in case this is not a distance found in your template files
172 for (int g = 0; g < 6; g++) {
177 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
180 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
182 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
184 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
186 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
188 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
190 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
196 for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; }
203 mothurOut("Done."); mothurOutEndLine();
207 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
208 templateSeqs.clear();
209 templateSeqs = readSeqs(templateFileName);
214 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
217 catch(exception& e) {
218 errorOut(e, "Pintail", "doPrep");
222 //***************************************************************************************************************
223 void Pintail::print(ostream& out, ostream& outAcc) {
225 int index = ceil(deviation);
227 //is your DE value higher than the 95%
229 if (index != 0) { //if index is 0 then its an exact match to a template seq
230 if (quantiles[index][4] == 0.0) {
231 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
233 if (DE > quantiles[index][4]) { chimera = "Yes"; }
234 else { chimera = "No"; }
236 }else{ chimera = "No"; }
238 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
239 if (chimera == "Yes") {
240 mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
241 outAcc << querySeq->getName() << endl;
245 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
250 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
255 catch(exception& e) {
256 errorOut(e, "Pintail", "print");
261 //***************************************************************************************************************
262 int Pintail::getChimeras(Sequence* query) {
266 windowSizes = window;
268 //find pairs has to be done before a mask
269 bestfit = findPairs(query);
273 decalc->runMask(query);
274 decalc->runMask(bestfit);
277 if (filter) { //must be done after a mask
284 decalc->trimSeqs(query, bestfit, trimmed);
287 it = trimmed.begin();
288 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
290 //find observed distance
291 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
293 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
296 seqCoef = decalc->getCoef(obsDistance, Qav);
298 //calculating expected distance
299 expectedDistance = decalc->calcExpected(Qav, seqCoef);
302 DE = decalc->calcDE(obsDistance, expectedDistance);
304 //find distance between query and closest match
305 it = trimmed.begin();
306 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
312 catch(exception& e) {
313 errorOut(e, "Pintail", "getChimeras");
318 //***************************************************************************************************************
320 vector<float> Pintail::readFreq() {
324 openInputFile(consfile, in);
327 set<int> h = decalc->getPos(); //positions of bases in masking sequence
329 //read in probabilities and store in vector
336 if (h.count(pos) > 0) {
338 Pi = (num - 0.25) / 0.75;
340 //cannot have probability less than 0.
341 if (Pi < 0) { Pi = 0.0; }
343 //do you want this spot
354 catch(exception& e) {
355 errorOut(e, "Pintail", "readFreq");
360 //***************************************************************************************************************
361 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
362 Sequence* Pintail::findPairs(Sequence* q) {
365 Sequence* seqsMatches;
367 seqsMatches = decalc->findClosest(q, templateSeqs);
371 catch(exception& e) {
372 errorOut(e, "Pintail", "findPairs");
376 /**************************************************************************************************/
377 void Pintail::createProcessesQuan() {
379 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
381 vector<int> processIDS;
383 //loop through and create all the processes you want
384 while (process != processors) {
388 processIDS.push_back(pid);
392 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
394 //write out data to file so parent can read it
396 string s = toString(getpid()) + ".temp";
397 openOutputFile(s, out);
400 //output observed distances
401 for (int i = 0; i < quantilesMembers.size(); i++) {
402 out << quantilesMembers[i].size() << '\t';
403 for (int j = 0; j < quantilesMembers[i].size(); j++) {
404 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
412 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
415 //force parent to wait until all the processes are done
416 for (int i=0;i<processors;i++) {
417 int temp = processIDS[i];
421 //get data created by processes
422 for (int i=0;i<processors;i++) {
424 string s = toString(processIDS[i]) + ".temp";
425 openInputFile(s, in);
427 vector< vector<quanMember> > quan;
431 for (int m = 0; m < quan.size(); m++) {
437 vector<quanMember> q; float w; int b, n;
438 for (int j = 0; j < num; j++) {
440 //cout << w << '\t' << b << '\t' n << endl;
441 quanMember newMember(w, b, n);
442 q.push_back(newMember);
444 //cout << "here" << endl;
446 //cout << "now here" << endl;
451 //save quan in quantiles
452 for (int j = 0; j < quan.size(); j++) {
453 //put all values of q[i] into quan[i]
454 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
455 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
463 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
466 catch(exception& e) {
467 errorOut(e, "Pintail", "createProcessesQuan");
473 //***************************************************************************************************************