5 * Created by westcott on 9/8/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "chimeracheckrdp.h"
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) { fastafile = filename; templateFile = temp; }
14 //***************************************************************************************************************
16 ChimeraCheckRDP::~ChimeraCheckRDP() {
18 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
23 errorOut(e, "ChimeraCheckRDP", "~AlignSim");
27 //***************************************************************************************************************
28 void ChimeraCheckRDP::print(ostream& out) {
33 //vector<bool> isChimeric; isChimeric.resize(querySeqs.size(), false);
35 for (int i = 0; i < querySeqs.size(); i++) {
37 out << querySeqs[i]->getName() << endl;
38 out << "IS scores: " << '\t';
40 //int lastChimericWindowFound = 0;
42 for (int k = 0; k < IS[i].size(); k++) {
43 out << IS[i][k].score << '\t';
44 //if (IS[i][k].score > chimeraCutoff) { isChimeric[i] = true; lastChimericWindowFound = k; }
47 //if (isChimeric[i]) {
48 //mothurOut(querySeqs[i]->getName() + "\tIS: " + toString(IS[i][lastChimericWindowFound].score) + "\tbreakpoint: " + toString(IS[i][lastChimericWindowFound].midpoint) + "\tleft parent: " + IS[i][lastChimericWindowFound].leftParent + "\tright parent: " + IS[i][lastChimericWindowFound].rightParent); mothurOutEndLine();
49 //out << endl << "chimera: YES" << endl;
51 //out << endl << "chimera: NO" << endl;
56 if (name != "") { //if user has specific names
57 map<string, string>::iterator it = names.find(querySeqs[i]->getName());
59 if (it != names.end()) { //user wants pic of this
60 makeSVGpic(IS[i], i); //zeros out negative results
62 }else{//output them all
63 makeSVGpic(IS[i], i); //zeros out negative results
68 mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine();
71 errorOut(e, "ChimeraCheckRDP", "print");
76 //***************************************************************************************************************
77 int ChimeraCheckRDP::getChimeras() {
80 //read in query sequences and subject sequences
82 mothurOut("Reading query sequences... "); cout.flush();
83 querySeqs = readSeqs(fastafile);
85 //templateSeqs = readSeqs(templateFile);
86 templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
89 int numSeqs = querySeqs.size();
92 closest.resize(numSeqs);
94 //break up file if needed
95 int linesPerProcess = numSeqs / processors ;
97 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
98 //find breakup of sequences for all times we will Parallelize
99 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
102 for (int i = 0; i < (processors-1); i++) {
103 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
105 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
106 int i = processors - 1;
107 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
111 lines.push_back(new linePair(0, numSeqs));
114 kmer = new Kmer(kmerSize);
117 readName(name); //fills name map with names of seqs the user wants to have .svg for.
120 //find closest seq to each querySeq
121 for (int i = 0; i < querySeqs.size(); i++) {
122 closest[i] = templateDB->findClosestSequence(querySeqs[i]);
125 //for each query find IS value
126 if (processors == 1) {
127 for (int i = 0; i < querySeqs.size(); i++) {
130 }else { createProcessesIS(); }
132 //determine chimera report cutoff - window score above 95%
133 //getCutoff(); - not very acurate predictor
136 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
140 catch(exception& e) {
141 errorOut(e, "ChimeraCheckRDP", "getChimeras");
145 //***************************************************************************************************************
146 vector<sim> ChimeraCheckRDP::findIS(int query) {
150 vector< map<int, int> > queryKmerInfo; //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
151 //example: seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
152 //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the
153 //kmers 2 seqs had in common. There may be a better way to do this thats why I am leaving so many comments...
154 vector< map<int, int> > subjectKmerInfo;
156 vector<sim> isValues;
157 string queryName = querySeqs[query]->getName();
158 string seq = querySeqs[query]->getUnaligned();
160 mothurOut("Finding IS values for sequence " + toString(query+1)); mothurOutEndLine();
162 queryKmerInfo = kmer->getKmerCounts(seq);
163 subjectKmerInfo = kmer->getKmerCounts(closest[query].getUnaligned());
165 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
166 int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
168 //you don't want the starting point to be virtually at hte end so move it in 10%
169 int start = seq.length() / 10;
172 for (int m = start; m < (seq.length() - start); m+=increment) {
174 if ((m - kmerSize) < 0) { mothurOut("Your sequence is too short for your kmerSize."); mothurOutEndLine(); exit(1); }
178 string fragLeft = seq.substr(0, m); //left side of breakpoint
179 string fragRight = seq.substr(m); //right side of breakpoint
181 //make a sequence of the left side and right side
182 Sequence* left = new Sequence(queryName, fragLeft);
183 Sequence* right = new Sequence(queryName, fragRight);
185 //find seqs closest to each fragment
186 Sequence closestLeft = templateDB->findClosestSequence(left);
188 Sequence closestRight = templateDB->findClosestSequence(right);
190 //get kmerinfo for the closest left
191 vector< map<int, int> > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned());
193 //get kmerinfo for the closest right
194 vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned());
196 //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
197 //iterate through left sides map to subtract the number of times you saw things before you got the the right side
198 map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1];
199 for (map<int, int>::iterator itleft = queryKmerInfo[m-kmerSize].begin(); itleft != queryKmerInfo[m-kmerSize].end(); itleft++) {
200 int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first]; //times that kmer was seen in total
202 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
203 int howmanyright = howManyTotal - itleft->second;
205 //if any were seen just on the left erase
206 if (howmanyright == 0) {
207 rightside.erase(itleft->first);
211 map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1];
212 for (map<int, int>::iterator itright = closeRightKmerInfo[m-kmerSize].begin(); itright != closeRightKmerInfo[m-kmerSize].end(); itright++) {
213 int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first]; //times that kmer was seen in total
215 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
216 int howmanyright = howManyTotal - itright->second;
218 //if any were seen just on the left erase
219 if (howmanyright == 0) {
220 closerightside.erase(itright->first);
225 int nLeft = calcKmers(closeLeftKmerInfo[m-kmerSize], queryKmerInfo[m-kmerSize]);
227 int nRight = calcKmers(closerightside, rightside);
229 int is = nLeft + nRight - nTotal;
231 //save IS, leftparent, rightparent, breakpoint
232 temp.leftParent = closestLeft.getName();
233 temp.rightParent = closestRight.getName();
237 isValues.push_back(temp);
246 catch(exception& e) {
247 errorOut(e, "ChimeraCheckRDP", "findIS");
251 //***************************************************************************************************************
252 void ChimeraCheckRDP::readName(string namefile) {
255 openInputFile(namefile, in);
268 catch(exception& e) {
269 errorOut(e, "ChimeraCheckRDP", "readName");
274 //***************************************************************************************************************
275 //find the smaller map and iterate through it and count kmers in common
276 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
280 map<int, int>::iterator small;
281 map<int, int>::iterator large;
283 if (query.size() < subject.size()) {
285 for (small = query.begin(); small != query.end(); small++) {
286 large = subject.find(small->first);
288 //if you found it they have that kmer in common
289 if (large != subject.end()) { common++; }
294 for (small = subject.begin(); small != subject.end(); small++) {
295 large = query.find(small->first);
297 //if you found it they have that kmer in common
298 if (large != query.end()) { common++; }
305 catch(exception& e) {
306 errorOut(e, "ChimeraCheckRDP", "calcKmers");
311 //***************************************************************************************************************
312 void ChimeraCheckRDP::getCutoff() {
317 //store all is scores for all windows
318 for (int i = 0; i < IS.size(); i++) {
319 for (int j = 0; j < IS[i].size(); j++) {
320 temp.push_back(IS[i][j].score);
325 sort(temp.begin(), temp.end());
328 chimeraCutoff = temp[int(temp.size() * 0.95)];
331 catch(exception& e) {
332 errorOut(e, "ChimeraCheckRDP", "getCutoff");
337 //***************************************************************************************************************
338 void ChimeraCheckRDP::makeSVGpic(vector<sim> info, int query) {
341 string file = querySeqs[query]->getName() + ".chimeracheck.svg";
343 openOutputFile(file, outsvg);
345 int width = (info.size()*5) + 150;
347 outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
349 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeqs[query]->getName() + "</text>\n";
351 outsvg << "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";
352 outsvg << "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
354 outsvg << "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
355 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
356 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
358 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
360 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
365 for (int i = 0; i < info.size(); i++) {
366 if (info[i].score > biggest) {
367 biggest = info[i].score;
371 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
373 int scaler2 = 500 / biggest;
376 outsvg << "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
377 //160,200 180,230 200,210 234,220\"/> ";
378 for (int i = 0; i < info.size(); i++) {
379 if(info[i].score < 0) { info[i].score = 0; }
380 outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " ";
384 outsvg << "</g>\n</svg>\n";
389 catch(exception& e) {
390 errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
394 //***************************************************************************************************************
395 void ChimeraCheckRDP::createProcessesIS() {
397 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
399 vector<int> processIDS;
401 //loop through and create all the processes you want
402 while (process != processors) {
406 processIDS.push_back(pid);
410 for (int i = lines[process]->start; i < lines[process]->end; i++) {
414 //write out data to file so parent can read it
416 string s = toString(getpid()) + ".temp";
417 openOutputFile(s, out);
420 for (int i = lines[process]->start; i < lines[process]->end; i++) {
421 out << IS[i].size() << endl;
422 for (int j = 0; j < IS[i].size(); j++) {
423 out << IS[i][j].leftParent << '\t'<< IS[i][j].rightParent << '\t' << IS[i][j].midpoint << '\t' << IS[i][j].score << endl;
430 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
433 //force parent to wait until all the processes are done
434 for (int i=0;i<processors;i++) {
435 int temp = processIDS[i];
439 //get data created by processes
440 for (int i=0;i<processors;i++) {
442 string s = toString(processIDS[i]) + ".temp";
443 openInputFile(s, in);
446 for (int k = lines[i]->start; k < lines[i]->end; k++) {
449 in >> size; gobble(in);
457 for (int j = 0; j < size; j++) {
458 in >> left >> right >> mid >> score; gobble(in);
461 temp.leftParent = left;
462 temp.rightParent = right;
466 IS[k].push_back(temp);
474 for (int i = 0; i < querySeqs.size(); i++) {
479 catch(exception& e) {
480 errorOut(e, "ChimeraCheckRDP", "createProcessesIS");
485 //***************************************************************************************************************