]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckrdp.cpp
1.9
[mothur.git] / chimeracheckrdp.cpp
1 /*
2  *  chimeracheckrdp.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/8/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckrdp.h"
11                 
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string n, bool s, int inc, int k, string o) : Chimera() { 
14         try {
15                 fastafile = filename; 
16                 templateFileName = temp;  
17                 name = n;
18                 svg = s;
19                 increment = inc;
20                 kmerSize = k;
21                 outputDir = o; 
22                 
23                 templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
24                 m->mothurOutEndLine();
25                 
26                 kmer = new Kmer(kmerSize);
27                 
28                 if (name != "") { 
29                         readName(name);  //fills name map with names of seqs the user wants to have .svg for.  
30                 }
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ChimeraCheckRDP", "ChimeraCheckRDP");
34                 exit(1);
35         }
36 }
37 //***************************************************************************************************************
38
39 ChimeraCheckRDP::~ChimeraCheckRDP() {
40         try {
41                 delete templateDB;
42                 delete kmer;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ChimeraCheckRDP", "~ChimeraCheckRDP");
46                 exit(1);
47         }
48 }       
49 //***************************************************************************************************************
50 int ChimeraCheckRDP::print(ostream& out, ostream& outAcc) {
51         try {
52                 
53                 m->mothurOut("Processing: " + querySeq->getName()); m->mothurOutEndLine();
54                 
55                 out << querySeq->getName() << endl;
56                 out << "IS scores: " << '\t';
57                         
58                 for (int k = 0; k < IS.size(); k++) {
59                         out << IS[k].score << '\t'; 
60                 }
61                 out << endl;
62                 
63                 if (svg) {
64                         if (name != "") { //if user has specific names
65                                 map<string, string>::iterator it = names.find(querySeq->getName());
66                                 
67                                 if (it != names.end()) { //user wants pic of this
68                                         makeSVGpic(IS);  //zeros out negative results
69                                 }
70                         }else{//output them all
71                                 makeSVGpic(IS);  //zeros out negative results
72                         }
73                 }
74                 
75                 return 0;
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ChimeraCheckRDP", "print");
79                 exit(1);
80         }
81 }
82 #ifdef USE_MPI
83 //***************************************************************************************************************
84 int ChimeraCheckRDP::print(MPI_File& out, MPI_File& outAcc) {
85         try {
86                 
87                 cout << "Processing: " << querySeq->getName() << endl; 
88                 
89                 string outString = "";
90                 
91                 outString += querySeq->getName() + "\nIS scores: \t";
92                         
93                 for (int k = 0; k < IS.size(); k++) {
94                         outString += toString(IS[k].score)  + "\t"; 
95                 }
96                 outString += "\n";
97                 
98                 MPI_Status status;
99                 int length = outString.length();
100                 char* buf = new char[length];\r
101                 memcpy(buf, outString.c_str(), length);
102                                 
103                 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
104                 delete buf;
105
106                 if (svg) {
107                         if (name != "") { //if user has specific names
108                                 map<string, string>::iterator it = names.find(querySeq->getName());
109                                 
110                                 if (it != names.end()) { //user wants pic of this
111                                         makeSVGpic(IS);  //zeros out negative results
112                                 }
113                         }else{//output them all
114                                 makeSVGpic(IS);  //zeros out negative results
115                         }
116                 }
117                 
118                 return 0;
119         }
120         catch(exception& e) {
121                 m->errorOut(e, "ChimeraCheckRDP", "print");
122                 exit(1);
123         }
124 }
125 #endif
126 //***************************************************************************************************************
127 int ChimeraCheckRDP::getChimeras(Sequence* query) {
128         try {
129                 
130                 IS.clear();
131                                 
132                 querySeq = query;
133                         
134                 closest = templateDB->findClosestSequence(query);  
135         
136                 IS = findIS(); 
137                                         
138                 //determine chimera report cutoff - window score above 95%
139                 //getCutoff();  - not very acurate predictor
140                 
141                 return 0;
142         }
143         catch(exception& e) {
144                 m->errorOut(e, "ChimeraCheckRDP", "getChimeras");
145                 exit(1);
146         }
147 }
148 //***************************************************************************************************************
149 vector<sim> ChimeraCheckRDP::findIS() {
150         try {
151                 
152                 
153                 vector< map<int, int> > queryKmerInfo;  //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
154                                                                                                 //example:  seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
155                                                                                                 //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the 
156                                                                                                 //kmers 2 seqs had in common.  There may be a better way to do this thats why I am leaving so many comments...
157                 vector< map<int, int> > subjectKmerInfo;
158                 
159                 vector<sim>  isValues;
160                 string queryName = querySeq->getName();
161                 string seq = querySeq->getUnaligned();
162                 
163                 queryKmerInfo = kmer->getKmerCounts(seq);
164                 subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned());
165                 
166                 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
167                 int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
168
169                 //you don't want the starting point to be virtually at hte end so move it in 10%
170                 int start = seq.length() / 10;
171                         
172                 //for each window
173                 for (int f = start; f < (seq.length() - start); f+=increment) {
174                 
175                         if (m->control_pressed) { return isValues; }
176                         
177                         if ((f - kmerSize) < 0)  { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); }
178                         
179                         sim temp;
180                         
181                         string fragLeft = seq.substr(0, f);  //left side of breakpoint
182                         string fragRight = seq.substr(f);  //right side of breakpoint
183                         
184                         //make a sequence of the left side and right side
185                         Sequence* left = new Sequence(queryName, fragLeft);
186                         Sequence* right = new Sequence(queryName, fragRight);
187                         
188                         //find seqs closest to each fragment
189                         Sequence closestLeft = templateDB->findClosestSequence(left); 
190         
191                         Sequence closestRight = templateDB->findClosestSequence(right); 
192                 
193                         //get kmerinfo for the closest left
194                         vector< map<int, int> > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned());
195                         
196                         //get kmerinfo for the closest right
197                         vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned());
198                         
199                         //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
200                         //iterate through left sides map to subtract the number of times you saw things before you got the the right side
201                         map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1];
202                         for (map<int, int>::iterator itleft = queryKmerInfo[f-kmerSize].begin(); itleft != queryKmerInfo[f-kmerSize].end(); itleft++) {
203                                 int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first];   //times that kmer was seen in total
204
205                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
206                                 int howmanyright = howManyTotal - itleft->second;
207                                 
208                                 //if any were seen just on the left erase
209                                 if (howmanyright == 0) {
210                                         rightside.erase(itleft->first);
211                                 }
212                         }
213                         
214                         map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1];
215                         for (map<int, int>::iterator itright = closeRightKmerInfo[f-kmerSize].begin(); itright != closeRightKmerInfo[f-kmerSize].end(); itright++) {
216                                 int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first];   //times that kmer was seen in total
217
218                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
219                                 int howmanyright = howManyTotal - itright->second;
220                                 
221                                 //if any were seen just on the left erase
222                                 if (howmanyright == 0) {
223                                         closerightside.erase(itright->first);
224                                 }
225                         }
226
227                         
228                         int nLeft = calcKmers(closeLeftKmerInfo[f-kmerSize], queryKmerInfo[f-kmerSize]);
229
230                         int nRight = calcKmers(closerightside, rightside);
231
232                         int is = nLeft + nRight - nTotal;
233
234                         //save IS, leftparent, rightparent, breakpoint
235                         temp.leftParent = closestLeft.getName();
236                         temp.rightParent = closestRight.getName();
237                         temp.score = is;
238                         temp.midpoint = f;
239                         
240                         isValues.push_back(temp);
241                         
242                         delete left;
243                         delete right;
244                 }
245                 
246                 return isValues;
247         
248         }
249         catch(exception& e) {
250                 m->errorOut(e, "ChimeraCheckRDP", "findIS");
251                 exit(1);
252         }
253 }
254 //***************************************************************************************************************
255 void ChimeraCheckRDP::readName(string namefile) {
256         try{
257         
258                 string name;
259
260         #ifdef USE_MPI
261                 
262                 MPI_File inMPI;
263                 MPI_Offset size;
264                 MPI_Status status;
265
266                 char* inFileName = new char[namefile.length()];\r
267                 memcpy(inFileName, namefile.c_str(), namefile.length());
268
269                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
270                 MPI_File_get_size(inMPI, &size);
271
272                 delete inFileName;
273
274                 char* buffer = new char[size];
275                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
276
277                 string tempBuf = buffer;
278                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
279                 istringstream iss (tempBuf,istringstream::in);
280                 delete buffer;
281                 
282                 while(!iss.eof()) {
283                         iss >> name; gobble(iss);
284                         names[name] = name;
285                 }
286         
287                 MPI_File_close(&inMPI);
288                 
289         #else   
290         
291                 ifstream in;
292                 openInputFile(namefile, in);
293                                 
294                 while (!in.eof()) {
295                         in >> name; gobble(in);
296                         names[name] = name;
297                 }
298                 in.close();
299         
300         #endif
301         
302         }
303         catch(exception& e) {
304                 m->errorOut(e, "ChimeraCheckRDP", "readName");
305                 exit(1);
306         }
307 }
308
309 //***************************************************************************************************************
310 //find the smaller map and iterate through it and count kmers in common
311 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
312         try{
313                 
314                 int common = 0;
315                 map<int, int>::iterator small;
316                 map<int, int>::iterator large;
317                 
318                 if (query.size() < subject.size()) {
319                 
320                         for (small = query.begin(); small != query.end(); small++) {
321                                 large = subject.find(small->first);
322                                 
323                                 //if you found it they have that kmer in common
324                                 if (large != subject.end()) {   common++;       }
325                         }
326                         
327                 }else { 
328                  
329                         for (small = subject.begin(); small != subject.end(); small++) {
330                                 large = query.find(small->first);
331                                 
332                                 //if you found it they have that kmer in common
333                                 if (large != query.end()) {             common++;        }
334                         }
335                 }
336                 
337                 return common;
338                 
339         }
340         catch(exception& e) {
341                 m->errorOut(e, "ChimeraCheckRDP", "calcKmers");
342                 exit(1);
343         }
344 }
345 #ifdef USE_MPI
346 //***************************************************************************************************************
347 void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
348         try{
349                 
350                 string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
351                 
352                 MPI_File outSVG;
353                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
354
355                 char* FileName = new char[file.length()];\r
356                 memcpy(FileName, file.c_str(), file.length());
357                 
358                 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outSVG);  //comm, filename, mode, info, filepointer
359                 
360                 delete FileName;
361
362                 int width = (info.size()*5) + 150;
363                 
364                 string outString = "";
365                 
366                 outString += "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
367                 outString += "<g>\n";
368                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
369                 
370                 outString +=  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
371                 outString +=  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
372                 
373                 outString += "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
374                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
375                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
376                 
377                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
378                 
379                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
380                 
381                 
382                 //find max is score
383                 float biggest = 0.0;
384                 for (int i = 0; i < info.size(); i++) {
385                         if (info[i].score > biggest)  {
386                                 biggest = info[i].score;
387                         }
388                 }
389                 
390                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
391                 
392                 int scaler2 = 500 / biggest;
393                 
394                 
395                 outString += "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
396                 //160,200 180,230 200,210 234,220\"/> "; 
397                 for (int i = 0; i < info.size(); i++) {
398                         if(info[i].score < 0) { info[i].score = 0; }
399                         outString += toString(((i*5) + 75)) + "," + toString((600 - (info[i].score * scaler2))) + " ";
400                 }
401                 
402                 outString += "\"/> ";
403                 outString += "</g>\n</svg>\n";
404                 
405                 MPI_Status status;
406                 int length = outString.length();
407                 char* buf2 = new char[length];\r
408                 memcpy(buf2, outString.c_str(), length);
409                                 
410                 MPI_File_write(outSVG, buf2, length, MPI_CHAR, &status);
411                 delete buf2;
412                 
413                 MPI_File_close(&outSVG);
414
415         }
416         catch(exception& e) {
417                 m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
418                 exit(1);
419         }
420 }
421 #else
422 //***************************************************************************************************************
423 void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
424         try{
425                 
426                 string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
427                 ofstream outsvg;
428                 openOutputFile(file, outsvg);
429                 
430                 int width = (info.size()*5) + 150;
431                 
432                 outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
433                 outsvg << "<g>\n";
434                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
435                 
436                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
437                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
438                 
439                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
440                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
441                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
442                 
443                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
444                 
445                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
446                 
447                 
448                 //find max is score
449                 float biggest = 0.0;
450                 for (int i = 0; i < info.size(); i++) {
451                         if (info[i].score > biggest)  {
452                                 biggest = info[i].score;
453                         }
454                 }
455                 
456                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
457                 
458                 int scaler2 = 500 / biggest;
459                 
460                 
461                 outsvg << "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
462                 //160,200 180,230 200,210 234,220\"/> "; 
463                 for (int i = 0; i < info.size(); i++) {
464                         if(info[i].score < 0) { info[i].score = 0; }
465                         outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " ";
466                 }
467                 
468                 outsvg << "\"/> ";
469                 outsvg << "</g>\n</svg>\n";
470                 
471                 outsvg.close();
472
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
476                 exit(1);
477         }
478 }
479 #endif
480 //***************************************************************************************************************/
481
482