]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckrdp.cpp
added sorted parameter to get.oturep, added error checking to chimera classes in...
[mothur.git] / chimeracheckrdp.cpp
1 /*
2  *  chimeracheckrdp.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/8/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckrdp.h"
11                 
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
14 //***************************************************************************************************************
15
16 ChimeraCheckRDP::~ChimeraCheckRDP() {
17         try {
18                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
19                 delete templateDB;
20                 delete kmer;
21         }
22         catch(exception& e) {
23                 errorOut(e, "ChimeraCheckRDP", "~AlignSim");
24                 exit(1);
25         }
26 }       
27 //***************************************************************************************************************
28 void ChimeraCheckRDP::print(ostream& out) {
29         try {
30                 
31                 mothurOutEndLine();
32                 
33                 //vector<bool> isChimeric;  isChimeric.resize(querySeqs.size(), false);
34                 
35                 for (int i = 0; i < querySeqs.size(); i++) {
36                         
37                                 out << querySeqs[i]->getName() << endl;
38                                 out << "IS scores: " << '\t';
39                                 
40                                 //int lastChimericWindowFound = 0;
41                                 
42                                 for (int k = 0; k < IS[i].size(); k++) {
43                                         out << IS[i][k].score << '\t'; 
44                                         //if (IS[i][k].score > chimeraCutoff) {  isChimeric[i] = true;   lastChimericWindowFound = k;           }                       
45                                 }
46                                 out << endl;
47                                 //if (isChimeric[i]) { 
48                                         //mothurOut(querySeqs[i]->getName() + "\tIS: " + toString(IS[i][lastChimericWindowFound].score) + "\tbreakpoint: " + toString(IS[i][lastChimericWindowFound].midpoint) + "\tleft parent: " + IS[i][lastChimericWindowFound].leftParent + "\tright parent: " + IS[i][lastChimericWindowFound].rightParent); mothurOutEndLine();
49                                         //out << endl << "chimera: YES" << endl;
50                                 //}else{
51                                         //out << endl << "chimera: NO" << endl;
52                                 //}
53                                 
54                                 if (svg) {
55                                         
56                                         if (name != "") { //if user has specific names
57                                                 map<string, string>::iterator it = names.find(querySeqs[i]->getName());
58                                         
59                                                 if (it != names.end()) { //user wants pic of this
60                                                         makeSVGpic(IS[i], i);  //zeros out negative results
61                                                 }
62                                         }else{//output them all
63                                                 makeSVGpic(IS[i], i);  //zeros out negative results
64                                         }
65                                 }
66                 }
67                 
68                 mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine();
69         }
70         catch(exception& e) {
71                 errorOut(e, "ChimeraCheckRDP", "print");
72                 exit(1);
73         }
74 }
75
76 //***************************************************************************************************************
77 int ChimeraCheckRDP::getChimeras() {
78         try {
79                 
80                 //read in query sequences and subject sequences
81                 mothurOutEndLine();
82                 mothurOut("Reading query sequences... "); cout.flush();
83                 querySeqs = readSeqs(fastafile);
84                 mothurOut("Done."); 
85                 //templateSeqs = readSeqs(templateFile);
86                 templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
87                 mothurOutEndLine();
88                 
89                 int numSeqs = querySeqs.size();
90                 
91                 IS.resize(numSeqs);
92                 closest.resize(numSeqs);
93                 
94                 //break up file if needed
95                 int linesPerProcess = numSeqs / processors ;
96                 
97                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
98                         //find breakup of sequences for all times we will Parallelize
99                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
100                         else {
101                                 //fill line pairs
102                                 for (int i = 0; i < (processors-1); i++) {                      
103                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
104                                 }
105                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
106                                 int i = processors - 1;
107                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
108                         }
109                         
110                 #else
111                         lines.push_back(new linePair(0, numSeqs));
112                 #endif
113                 
114                 kmer = new Kmer(kmerSize);
115                 
116                 if (name != "") { 
117                         readName(name);  //fills name map with names of seqs the user wants to have .svg for.  
118                 }
119                 
120                 //find closest seq to each querySeq
121                 for (int i = 0; i < querySeqs.size(); i++) {
122                         closest[i] = templateDB->findClosestSequence(querySeqs[i]);  
123                 }
124
125                 //for each query find IS value  
126                 if (processors == 1) {
127                         for (int i = 0; i < querySeqs.size(); i++) {
128                                 IS[i] = findIS(i); 
129                         }
130                 }else {         createProcessesIS();    }
131                 
132                 //determine chimera report cutoff - window score above 95%
133                 //getCutoff();  - not very acurate predictor
134                 
135                 //free memory
136                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
137         
138                 return 0;
139         }
140         catch(exception& e) {
141                 errorOut(e, "ChimeraCheckRDP", "getChimeras");
142                 exit(1);
143         }
144 }
145 //***************************************************************************************************************
146 vector<sim> ChimeraCheckRDP::findIS(int query) {
147         try {
148                 
149                 
150                 vector< map<int, int> > queryKmerInfo;  //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
151                                                                                                 //example:  seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
152                                                                                                 //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the 
153                                                                                                 //kmers 2 seqs had in common.  There may be a better way to do this thats why I am leaving so many comments...
154                 vector< map<int, int> > subjectKmerInfo;
155                 
156                 vector<sim>  isValues;
157                 string queryName = querySeqs[query]->getName();
158                 string seq = querySeqs[query]->getUnaligned();
159                 
160                 mothurOut("Finding IS values for sequence " + toString(query+1)); mothurOutEndLine();
161                 
162                 queryKmerInfo = kmer->getKmerCounts(seq);
163                 subjectKmerInfo = kmer->getKmerCounts(closest[query].getUnaligned());
164                 
165                 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
166                 int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
167
168                 //you don't want the starting point to be virtually at hte end so move it in 10%
169                 int start = seq.length() / 10;
170                         
171                 //for each window
172                 for (int m = start; m < (seq.length() - start); m+=increment) {
173                         
174                         if ((m - kmerSize) < 0)  { mothurOut("Your sequence is too short for your kmerSize."); mothurOutEndLine(); exit(1); }
175                         
176                         sim temp;
177                         
178                         string fragLeft = seq.substr(0, m);  //left side of breakpoint
179                         string fragRight = seq.substr(m);  //right side of breakpoint
180                         
181                         //make a sequence of the left side and right side
182                         Sequence* left = new Sequence(queryName, fragLeft);
183                         Sequence* right = new Sequence(queryName, fragRight);
184                         
185                         //find seqs closest to each fragment
186                         Sequence closestLeft = templateDB->findClosestSequence(left); 
187         
188                         Sequence closestRight = templateDB->findClosestSequence(right); 
189                 
190                         //get kmerinfo for the closest left
191                         vector< map<int, int> > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned());
192                         
193                         //get kmerinfo for the closest right
194                         vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned());
195                         
196                         //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
197                         //iterate through left sides map to subtract the number of times you saw things before you got the the right side
198                         map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1];
199                         for (map<int, int>::iterator itleft = queryKmerInfo[m-kmerSize].begin(); itleft != queryKmerInfo[m-kmerSize].end(); itleft++) {
200                                 int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first];   //times that kmer was seen in total
201
202                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
203                                 int howmanyright = howManyTotal - itleft->second;
204                                 
205                                 //if any were seen just on the left erase
206                                 if (howmanyright == 0) {
207                                         rightside.erase(itleft->first);
208                                 }
209                         }
210                         
211                         map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1];
212                         for (map<int, int>::iterator itright = closeRightKmerInfo[m-kmerSize].begin(); itright != closeRightKmerInfo[m-kmerSize].end(); itright++) {
213                                 int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first];   //times that kmer was seen in total
214
215                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
216                                 int howmanyright = howManyTotal - itright->second;
217                                 
218                                 //if any were seen just on the left erase
219                                 if (howmanyright == 0) {
220                                         closerightside.erase(itright->first);
221                                 }
222                         }
223
224                         
225                         int nLeft = calcKmers(closeLeftKmerInfo[m-kmerSize], queryKmerInfo[m-kmerSize]);
226
227                         int nRight = calcKmers(closerightside, rightside);
228
229                         int is = nLeft + nRight - nTotal;
230
231                         //save IS, leftparent, rightparent, breakpoint
232                         temp.leftParent = closestLeft.getName();
233                         temp.rightParent = closestRight.getName();
234                         temp.score = is;
235                         temp.midpoint = m;
236                         
237                         isValues.push_back(temp);
238                         
239                         delete left;
240                         delete right;
241                 }       
242                 
243                 return isValues;
244         
245         }
246         catch(exception& e) {
247                 errorOut(e, "ChimeraCheckRDP", "findIS");
248                 exit(1);
249         }
250 }
251 //***************************************************************************************************************
252 void ChimeraCheckRDP::readName(string namefile) {
253         try{
254                 ifstream in;
255                 openInputFile(namefile, in);
256                 string name;
257                 
258                 while (!in.eof()) {
259                         
260                         in >> name;
261                         
262                         names[name] = name;
263                         
264                         gobble(in);
265                 }
266         
267         }
268         catch(exception& e) {
269                 errorOut(e, "ChimeraCheckRDP", "readName");
270                 exit(1);
271         }
272 }
273
274 //***************************************************************************************************************
275 //find the smaller map and iterate through it and count kmers in common
276 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
277         try{
278                 
279                 int common = 0;
280                 map<int, int>::iterator small;
281                 map<int, int>::iterator large;
282                 
283                 if (query.size() < subject.size()) {
284                 
285                         for (small = query.begin(); small != query.end(); small++) {
286                                 large = subject.find(small->first);
287                                 
288                                 //if you found it they have that kmer in common
289                                 if (large != subject.end()) {   common++;       }
290                         }
291                         
292                 }else { 
293                  
294                         for (small = subject.begin(); small != subject.end(); small++) {
295                                 large = query.find(small->first);
296                                 
297                                 //if you found it they have that kmer in common
298                                 if (large != query.end()) {             common++;        }
299                         }
300                 }
301                 
302                 return common;
303                 
304         }
305         catch(exception& e) {
306                 errorOut(e, "ChimeraCheckRDP", "calcKmers");
307                 exit(1);
308         }
309 }
310
311 //***************************************************************************************************************
312 void ChimeraCheckRDP::getCutoff() {
313         try{
314                 
315                 vector<float> temp;
316                 
317                 //store all is scores for all windows
318                 for (int i = 0; i < IS.size(); i++) {
319                         for (int j = 0; j < IS[i].size(); j++) {
320                                 temp.push_back(IS[i][j].score);
321                         }
322                 }
323                 
324                 //sort them
325                 sort(temp.begin(), temp.end());
326                 
327                 //get 95%
328                 chimeraCutoff = temp[int(temp.size() * 0.95)];
329
330         }
331         catch(exception& e) {
332                 errorOut(e, "ChimeraCheckRDP", "getCutoff");
333                 exit(1);
334         }
335 }
336
337 //***************************************************************************************************************
338 void ChimeraCheckRDP::makeSVGpic(vector<sim> info, int query) {
339         try{
340                 
341                 string file = querySeqs[query]->getName() + ".chimeracheck.svg";
342                 ofstream outsvg;
343                 openOutputFile(file, outsvg);
344                 
345                 int width = (info.size()*5) + 150;
346                 
347                 outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
348                 outsvg << "<g>\n";
349                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeqs[query]->getName() + "</text>\n";
350                 
351                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
352                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
353                 
354                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
355                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
356                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
357                 
358                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
359                 
360                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
361                 
362                 
363                 //find max is score
364                 float biggest = 0.0;
365                 for (int i = 0; i < info.size(); i++) {
366                         if (info[i].score > biggest)  {
367                                 biggest = info[i].score;
368                         }
369                 }
370                 
371                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
372                 
373                 int scaler2 = 500 / biggest;
374                 
375                 
376                 outsvg << "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
377                 //160,200 180,230 200,210 234,220\"/> "; 
378                 for (int i = 0; i < info.size(); i++) {
379                         if(info[i].score < 0) { info[i].score = 0; }
380                         outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " ";
381                 }
382                 
383                 outsvg << "\"/> ";
384                 outsvg << "</g>\n</svg>\n";
385                 
386                 outsvg.close();
387
388         }
389         catch(exception& e) {
390                 errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
391                 exit(1);
392         }
393 }
394 //***************************************************************************************************************
395 void ChimeraCheckRDP::createProcessesIS() {
396         try {
397         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
398                 int process = 0;
399                 vector<int> processIDS;
400                 
401                 //loop through and create all the processes you want
402                 while (process != processors) {
403                         int pid = fork();
404                         
405                         if (pid > 0) {
406                                 processIDS.push_back(pid);  
407                                 process++;
408                         }else if (pid == 0){
409                                                         
410                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
411                                         IS[i] = findIS(i);  
412                                 }                               
413                                 
414                                 //write out data to file so parent can read it
415                                 ofstream out;
416                                 string s = toString(getpid()) + ".temp";
417                                 openOutputFile(s, out);
418                                 
419                                 //output pairs
420                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
421                                         out << IS[i].size() << endl;
422                                         for (int j = 0; j < IS[i].size(); j++) {
423                                                 out << IS[i][j].leftParent << '\t'<< IS[i][j].rightParent << '\t' << IS[i][j].midpoint << '\t' << IS[i][j].score << endl;
424                                         }
425                                 }
426
427                                 out.close();
428                                 
429                                 exit(0);
430                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
431                 }
432                 
433                 //force parent to wait until all the processes are done
434                 for (int i=0;i<processors;i++) { 
435                         int temp = processIDS[i];
436                         wait(&temp);
437                 }
438         
439                 //get data created by processes
440                 for (int i=0;i<processors;i++) { 
441                         ifstream in;
442                         string s = toString(processIDS[i]) + ".temp";
443                         openInputFile(s, in);
444                 
445                         //get pairs
446                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
447                         
448                                 int size;
449                                 in >> size; gobble(in);
450                                 
451                                 string left, right;
452                                 int mid;
453                                 float score;
454                                 
455                                 IS[k].clear();
456                                 
457                                 for (int j = 0; j < size; j++) {
458                                         in >> left >> right >> mid >> score;  gobble(in);
459                                         
460                                         sim temp;
461                                         temp.leftParent = left;
462                                         temp.rightParent = right;
463                                         temp.midpoint = mid;
464                                         temp.score = score;
465                                         
466                                         IS[k].push_back(temp);
467                                 }
468                         }
469                         
470                         in.close();
471                         remove(s.c_str());
472                 }
473 #else
474                         for (int i = 0; i < querySeqs.size(); i++) {
475                                 IS[i] = findIS(i);
476                         }
477 #endif          
478         }
479         catch(exception& e) {
480                 errorOut(e, "ChimeraCheckRDP", "createProcessesIS");
481                 exit(1);
482         }
483 }
484
485 //***************************************************************************************************************
486
487