5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
21 Pintail::Pintail(string filename, string o) {
22 fastafile = filename; outputDir = o;
23 distcalculator = new eachGapDist();
24 decalc = new DeCalculator();
26 //***************************************************************************************************************
31 delete distcalculator;
35 m->errorOut(e, "Pintail", "~Pintail");
39 //***************************************************************************************************************
40 void Pintail::doPrep() {
43 mergedFilterString = "";
44 windowSizesTemplate.resize(templateSeqs.size(), window);
45 quantiles.resize(100); //one for every percent mismatch
46 quantilesMembers.resize(100); //one for every percent mismatch
48 //if the user does not enter a mask then you want to keep all the spots in the alignment
49 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
50 else { decalc->setAlignmentLength(seqMask.length()); }
52 decalc->setMask(seqMask);
54 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
55 //find breakup of templatefile for quantiles
56 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
58 for (int i = 0; i < processors; i++) {
59 templateLines.push_back(new linePair());
60 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
61 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
65 templateLines.push_back(new linePair(0, templateSeqs.size()));
69 m->mothurOut("Getting conservation... "); cout.flush();
71 m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
72 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
73 m->mothurOut("Done."); m->mothurOutEndLine();
74 }else { probabilityProfile = readFreq(); m->mothurOut("Done."); }
75 m->mothurOutEndLine();
78 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
81 //create filter if needed for later
84 //read in all query seqs
86 openInputFile(fastafile, in);
88 vector<Sequence*> tempQuerySeqs;
90 Sequence* s = new Sequence(in);
93 if (s->getName() != "") { tempQuerySeqs.push_back(s); }
97 vector<Sequence*> temp;
98 //merge query seqs and template seqs
100 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
105 for (int i = 0; i < temp.size(); i++) {
106 decalc->runMask(temp[i]);
110 mergedFilterString = createFilter(temp, 0.5);
112 //reread template seqs
113 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
117 //quantiles are used to determine whether the de values found indicate a chimera
118 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
119 //combination of sequences in the template
120 if (quanfile != "") {
121 quantiles = readQuantiles();
123 if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
126 for (int i = 0; i < templateSeqs.size(); i++) {
127 decalc->runMask(templateSeqs[i]);
133 for (int i = 0; i < templateSeqs.size(); i++) {
134 runFilter(templateSeqs[i]);
138 m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
139 if (processors == 1) {
140 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
141 }else { createProcessesQuan(); }
145 string noOutliers, outliers;
147 if ((!filter) && (seqMask == "")) {
148 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
149 }else if ((!filter) && (seqMask != "")) {
150 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
151 }else if ((filter) && (seqMask != "")) {
152 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
153 }else if ((filter) && (seqMask == "")) {
154 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
160 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
162 openOutputFile(noOutliers, out5);
164 for (int i = 0; i < quantilesMembers.size(); i++) {
167 if (quantilesMembers[i].size() == 0) {
168 //in case this is not a distance found in your template files
169 for (int g = 0; g < 6; g++) {
174 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
177 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
179 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
181 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
183 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
185 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
187 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
193 for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; }
200 m->mothurOut("Done."); m->mothurOutEndLine();
204 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
205 templateSeqs.clear();
206 templateSeqs = readSeqs(templateFileName);
211 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
214 catch(exception& e) {
215 m->errorOut(e, "Pintail", "doPrep");
219 //***************************************************************************************************************
220 void Pintail::print(ostream& out, ostream& outAcc) {
222 int index = ceil(deviation);
224 //is your DE value higher than the 95%
226 if (index != 0) { //if index is 0 then its an exact match to a template seq
227 if (quantiles[index][4] == 0.0) {
228 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
230 if (DE > quantiles[index][4]) { chimera = "Yes"; }
231 else { chimera = "No"; }
233 }else{ chimera = "No"; }
235 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
236 if (chimera == "Yes") {
237 m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
238 outAcc << querySeq->getName() << endl;
242 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
247 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
252 catch(exception& e) {
253 m->errorOut(e, "Pintail", "print");
258 //***************************************************************************************************************
259 int Pintail::getChimeras(Sequence* query) {
263 windowSizes = window;
265 //find pairs has to be done before a mask
266 bestfit = findPairs(query);
270 decalc->runMask(query);
271 decalc->runMask(bestfit);
274 if (filter) { //must be done after a mask
281 decalc->trimSeqs(query, bestfit, trimmed);
284 it = trimmed.begin();
285 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
287 //find observed distance
288 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
290 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
293 seqCoef = decalc->getCoef(obsDistance, Qav);
295 //calculating expected distance
296 expectedDistance = decalc->calcExpected(Qav, seqCoef);
299 DE = decalc->calcDE(obsDistance, expectedDistance);
301 //find distance between query and closest match
302 it = trimmed.begin();
303 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
309 catch(exception& e) {
310 m->errorOut(e, "Pintail", "getChimeras");
315 //***************************************************************************************************************
317 vector<float> Pintail::readFreq() {
321 openInputFile(consfile, in);
324 set<int> h = decalc->getPos(); //positions of bases in masking sequence
326 //read in probabilities and store in vector
333 if (h.count(pos) > 0) {
335 Pi = (num - 0.25) / 0.75;
337 //cannot have probability less than 0.
338 if (Pi < 0) { Pi = 0.0; }
340 //do you want this spot
351 catch(exception& e) {
352 m->errorOut(e, "Pintail", "readFreq");
357 //***************************************************************************************************************
358 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
359 Sequence* Pintail::findPairs(Sequence* q) {
362 Sequence* seqsMatches;
364 seqsMatches = decalc->findClosest(q, templateSeqs);
368 catch(exception& e) {
369 m->errorOut(e, "Pintail", "findPairs");
373 /**************************************************************************************************/
374 void Pintail::createProcessesQuan() {
376 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
378 vector<int> processIDS;
380 //loop through and create all the processes you want
381 while (process != processors) {
385 processIDS.push_back(pid);
389 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
391 //write out data to file so parent can read it
393 string s = toString(getpid()) + ".temp";
394 openOutputFile(s, out);
397 //output observed distances
398 for (int i = 0; i < quantilesMembers.size(); i++) {
399 out << quantilesMembers[i].size() << '\t';
400 for (int j = 0; j < quantilesMembers[i].size(); j++) {
401 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
409 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
412 //force parent to wait until all the processes are done
413 for (int i=0;i<processors;i++) {
414 int temp = processIDS[i];
418 //get data created by processes
419 for (int i=0;i<processors;i++) {
421 string s = toString(processIDS[i]) + ".temp";
422 openInputFile(s, in);
424 vector< vector<quanMember> > quan;
428 for (int m = 0; m < quan.size(); m++) {
434 vector<quanMember> q; float w; int b, n;
435 for (int j = 0; j < num; j++) {
437 //cout << w << '\t' << b << '\t' n << endl;
438 quanMember newMember(w, b, n);
439 q.push_back(newMember);
441 //cout << "here" << endl;
443 //cout << "now here" << endl;
448 //save quan in quantiles
449 for (int j = 0; j < quan.size(); j++) {
450 //put all values of q[i] into quan[i]
451 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
452 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
460 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
463 catch(exception& e) {
464 m->errorOut(e, "Pintail", "createProcessesQuan");
470 //***************************************************************************************************************