5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17 return (left.score < right.score);
19 //***************************************************************************************************************
21 Pintail::Pintail(string filename, string o) {
22 fastafile = filename; outputDir = o;
23 distcalculator = new eachGapDist();
24 decalc = new DeCalculator();
26 //***************************************************************************************************************
31 delete distcalculator;
35 errorOut(e, "Pintail", "~Pintail");
39 //***************************************************************************************************************
40 void Pintail::doPrep() {
43 windowSizesTemplate.resize(templateSeqs.size(), window);
44 quantiles.resize(100); //one for every percent mismatch
45 quantilesMembers.resize(100); //one for every percent mismatch
47 //if the user does not enter a mask then you want to keep all the spots in the alignment
48 if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
49 else { decalc->setAlignmentLength(seqMask.length()); }
51 decalc->setMask(seqMask);
53 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
54 //find breakup of templatefile for quantiles
55 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
57 for (int i = 0; i < processors; i++) {
58 templateLines.push_back(new linePair());
59 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
60 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
64 templateLines.push_back(new linePair(0, templateSeqs.size()));
68 mothurOut("Getting conservation... "); cout.flush();
70 mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
71 probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
72 mothurOut("Done."); mothurOutEndLine();
73 }else { probabilityProfile = readFreq(); mothurOut("Done."); }
76 //quantiles are used to determine whether the de values found indicate a chimera
77 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
78 //combination of sequences in the template
79 if (quanfile != "") { quantiles = readQuantiles(); }
82 //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
85 for (int i = 0; i < templateSeqs.size(); i++) {
86 decalc->runMask(templateSeqs[i]);
91 createFilter(templateSeqs);
92 for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
96 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
97 if (processors == 1) {
98 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
99 }else { createProcessesQuan(); }
103 string noOutliers, outliers;
105 if ((!filter) && (seqMask == "")) {
106 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
107 }else if ((filter) && (seqMask == "")) {
108 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan";
109 }else if ((!filter) && (seqMask != "")) {
110 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
111 }else if ((filter) && (seqMask != "")) {
112 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan";
116 decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
118 openOutputFile(noOutliers, out5);
121 for (int i = 0; i < quantilesMembers.size(); i++) {
124 if (quantilesMembers[i].size() == 0) {
125 //in case this is not a distance found in your template files
126 for (int g = 0; g < 6; g++) {
131 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
134 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
136 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
138 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
140 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
142 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
144 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
150 for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; }
157 mothurOut("Done."); mothurOutEndLine();
159 //if mask reread template
160 if ((seqMask != "") || (filter)) {
161 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
162 templateSeqs = readSeqs(templateFileName);
168 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
171 catch(exception& e) {
172 errorOut(e, "Pintail", "doPrep");
176 //***************************************************************************************************************
177 void Pintail::print(ostream& out) {
179 int index = ceil(deviation);
181 //is your DE value higher than the 95%
183 if (index != 0) { //if index is 0 then its an exact match to a template seq
184 if (quantiles[index][4] == 0.0) {
185 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
187 if (DE > quantiles[index][4]) { chimera = "Yes"; }
188 else { chimera = "No"; }
190 }else{ chimera = "No"; }
192 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
193 if (chimera == "Yes") {
194 mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
198 for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
203 for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
208 catch(exception& e) {
209 errorOut(e, "Pintail", "print");
214 //***************************************************************************************************************
215 int Pintail::getChimeras(Sequence* query) {
219 windowSizes = window;
222 bestfit = findPairs(query);
225 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
227 //mask sequences if the user wants to
229 decalc->runMask(query);
230 decalc->runMask(bestfit);
239 decalc->trimSeqs(query, bestfit, trimmed);
242 it = trimmed.begin();
243 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
245 //find observed distance
246 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
248 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
251 seqCoef = decalc->getCoef(obsDistance, Qav);
253 //calculating expected distance
254 expectedDistance = decalc->calcExpected(Qav, seqCoef);
257 DE = decalc->calcDE(obsDistance, expectedDistance);
259 //find distance between query and closest match
260 it = trimmed.begin();
261 deviation = decalc->calcDist(query, bestfit, it->first, it->second);
267 catch(exception& e) {
268 errorOut(e, "Pintail", "getChimeras");
273 //***************************************************************************************************************
275 vector<float> Pintail::readFreq() {
279 openInputFile(consfile, in);
282 set<int> h = decalc->getPos(); //positions of bases in masking sequence
284 //read in probabilities and store in vector
291 if (h.count(pos) > 0) {
293 Pi = (num - 0.25) / 0.75;
295 //cannot have probability less than 0.
296 if (Pi < 0) { Pi = 0.0; }
298 //do you want this spot
309 catch(exception& e) {
310 errorOut(e, "Pintail", "readFreq");
315 //***************************************************************************************************************
316 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
317 Sequence* Pintail::findPairs(Sequence* q) {
320 Sequence* seqsMatches;
322 vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
323 seqsMatches = copy[0];
328 catch(exception& e) {
329 errorOut(e, "Pintail", "findPairs");
333 /**************************************************************************************************/
334 void Pintail::createProcessesQuan() {
336 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
338 vector<int> processIDS;
340 //loop through and create all the processes you want
341 while (process != processors) {
345 processIDS.push_back(pid);
349 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
351 //write out data to file so parent can read it
353 string s = toString(getpid()) + ".temp";
354 openOutputFile(s, out);
357 //output observed distances
358 for (int i = 0; i < quantilesMembers.size(); i++) {
359 out << quantilesMembers[i].size() << '\t';
360 for (int j = 0; j < quantilesMembers[i].size(); j++) {
361 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
369 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
372 //force parent to wait until all the processes are done
373 for (int i=0;i<processors;i++) {
374 int temp = processIDS[i];
378 //get data created by processes
379 for (int i=0;i<processors;i++) {
381 string s = toString(processIDS[i]) + ".temp";
382 openInputFile(s, in);
384 vector< vector<quanMember> > quan;
388 for (int m = 0; m < quan.size(); m++) {
394 vector<quanMember> q; float w; int b, n;
395 for (int j = 0; j < num; j++) {
397 //cout << w << '\t' << b << '\t' n << endl;
398 quanMember newMember(w, b, n);
399 q.push_back(newMember);
401 //cout << "here" << endl;
403 //cout << "now here" << endl;
408 //save quan in quantiles
409 for (int j = 0; j < quan.size(); j++) {
410 //put all values of q[i] into quan[i]
411 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
412 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
420 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
423 catch(exception& e) {
424 errorOut(e, "Pintail", "createProcessesQuan");
430 //***************************************************************************************************************