]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckrdp.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / chimeracheckrdp.cpp
1 /*
2  *  chimeracheckrdp.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/8/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckrdp.h"
11                 
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string n, bool s, int inc, int k, string o) : Chimera() { 
14         try {
15                 fastafile = filename; 
16                 templateFileName = temp;  
17                 name = n;
18                 svg = s;
19                 increment = inc;
20                 kmerSize = k;
21                 outputDir = o; 
22                 
23                 templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
24                 m->mothurOutEndLine();
25                 
26                 kmer = new Kmer(kmerSize);
27                 
28                 if (name != "") { 
29                         readName(name);  //fills name map with names of seqs the user wants to have .svg for.  
30                 }
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ChimeraCheckRDP", "ChimeraCheckRDP");
34                 exit(1);
35         }
36 }
37 //***************************************************************************************************************
38
39 ChimeraCheckRDP::~ChimeraCheckRDP() {
40         try {
41                 delete templateDB;
42                 delete kmer;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ChimeraCheckRDP", "~ChimeraCheckRDP");
46                 exit(1);
47         }
48 }       
49 //***************************************************************************************************************
50 int ChimeraCheckRDP::print(ostream& out, ostream& outAcc) {
51         try {
52                 
53                 m->mothurOut("Processing: " + querySeq->getName()); m->mothurOutEndLine();
54                 
55                 out << querySeq->getName() << endl;
56                 out << "IS scores: " << '\t';
57                         
58                 for (int k = 0; k < IS.size(); k++) {
59                         out << IS[k].score << '\t'; 
60                 }
61                 out << endl;
62                 
63                 if (svg) {
64                         if (name != "") { //if user has specific names
65                                 map<string, string>::iterator it = names.find(querySeq->getName());
66                                 
67                                 if (it != names.end()) { //user wants pic of this
68                                         makeSVGpic(IS);  //zeros out negative results
69                                 }
70                         }else{//output them all
71                                 makeSVGpic(IS);  //zeros out negative results
72                         }
73                 }
74                 
75                 return 0;
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ChimeraCheckRDP", "print");
79                 exit(1);
80         }
81 }
82 #ifdef USE_MPI
83 //***************************************************************************************************************
84 int ChimeraCheckRDP::print(MPI_File& out, MPI_File& outAcc) {
85         try {
86                 
87                 cout << "Processing: " << querySeq->getName() << endl; 
88                 
89                 string outString = "";
90                 
91                 outString += querySeq->getName() + "\nIS scores: \t";
92                         
93                 for (int k = 0; k < IS.size(); k++) {
94                         outString += toString(IS[k].score)  + "\t"; 
95                 }
96                 outString += "\n";
97                 
98                 MPI_Status status;
99                 int length = outString.length();
100                 char buf[length];
101                 strcpy(buf, outString.c_str()); 
102                                 
103                 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
104                 
105                 if (svg) {
106                         if (name != "") { //if user has specific names
107                                 map<string, string>::iterator it = names.find(querySeq->getName());
108                                 
109                                 if (it != names.end()) { //user wants pic of this
110                                         makeSVGpic(IS);  //zeros out negative results
111                                 }
112                         }else{//output them all
113                                 makeSVGpic(IS);  //zeros out negative results
114                         }
115                 }
116                 
117                 return 0;
118         }
119         catch(exception& e) {
120                 m->errorOut(e, "ChimeraCheckRDP", "print");
121                 exit(1);
122         }
123 }
124 #endif
125 //***************************************************************************************************************
126 int ChimeraCheckRDP::getChimeras(Sequence* query) {
127         try {
128                 
129                 IS.clear();
130                                 
131                 querySeq = query;
132                         
133                 closest = templateDB->findClosestSequence(query);  
134         
135                 IS = findIS(); 
136                                         
137                 //determine chimera report cutoff - window score above 95%
138                 //getCutoff();  - not very acurate predictor
139                 
140                 return 0;
141         }
142         catch(exception& e) {
143                 m->errorOut(e, "ChimeraCheckRDP", "getChimeras");
144                 exit(1);
145         }
146 }
147 //***************************************************************************************************************
148 vector<sim> ChimeraCheckRDP::findIS() {
149         try {
150                 
151                 
152                 vector< map<int, int> > queryKmerInfo;  //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
153                                                                                                 //example:  seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
154                                                                                                 //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the 
155                                                                                                 //kmers 2 seqs had in common.  There may be a better way to do this thats why I am leaving so many comments...
156                 vector< map<int, int> > subjectKmerInfo;
157                 
158                 vector<sim>  isValues;
159                 string queryName = querySeq->getName();
160                 string seq = querySeq->getUnaligned();
161                 
162                 queryKmerInfo = kmer->getKmerCounts(seq);
163                 subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned());
164                 
165                 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
166                 int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
167
168                 //you don't want the starting point to be virtually at hte end so move it in 10%
169                 int start = seq.length() / 10;
170                         
171                 //for each window
172                 for (int f = start; f < (seq.length() - start); f+=increment) {
173                 
174                         if (m->control_pressed) { return isValues; }
175                         
176                         if ((f - kmerSize) < 0)  { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); }
177                         
178                         sim temp;
179                         
180                         string fragLeft = seq.substr(0, f);  //left side of breakpoint
181                         string fragRight = seq.substr(f);  //right side of breakpoint
182                         
183                         //make a sequence of the left side and right side
184                         Sequence* left = new Sequence(queryName, fragLeft);
185                         Sequence* right = new Sequence(queryName, fragRight);
186                         
187                         //find seqs closest to each fragment
188                         Sequence closestLeft = templateDB->findClosestSequence(left); 
189         
190                         Sequence closestRight = templateDB->findClosestSequence(right); 
191                 
192                         //get kmerinfo for the closest left
193                         vector< map<int, int> > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned());
194                         
195                         //get kmerinfo for the closest right
196                         vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned());
197                         
198                         //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
199                         //iterate through left sides map to subtract the number of times you saw things before you got the the right side
200                         map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1];
201                         for (map<int, int>::iterator itleft = queryKmerInfo[f-kmerSize].begin(); itleft != queryKmerInfo[f-kmerSize].end(); itleft++) {
202                                 int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first];   //times that kmer was seen in total
203
204                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
205                                 int howmanyright = howManyTotal - itleft->second;
206                                 
207                                 //if any were seen just on the left erase
208                                 if (howmanyright == 0) {
209                                         rightside.erase(itleft->first);
210                                 }
211                         }
212                         
213                         map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1];
214                         for (map<int, int>::iterator itright = closeRightKmerInfo[f-kmerSize].begin(); itright != closeRightKmerInfo[f-kmerSize].end(); itright++) {
215                                 int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first];   //times that kmer was seen in total
216
217                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
218                                 int howmanyright = howManyTotal - itright->second;
219                                 
220                                 //if any were seen just on the left erase
221                                 if (howmanyright == 0) {
222                                         closerightside.erase(itright->first);
223                                 }
224                         }
225
226                         
227                         int nLeft = calcKmers(closeLeftKmerInfo[f-kmerSize], queryKmerInfo[f-kmerSize]);
228
229                         int nRight = calcKmers(closerightside, rightside);
230
231                         int is = nLeft + nRight - nTotal;
232
233                         //save IS, leftparent, rightparent, breakpoint
234                         temp.leftParent = closestLeft.getName();
235                         temp.rightParent = closestRight.getName();
236                         temp.score = is;
237                         temp.midpoint = f;
238                         
239                         isValues.push_back(temp);
240                         
241                         delete left;
242                         delete right;
243                 }
244                 
245                 return isValues;
246         
247         }
248         catch(exception& e) {
249                 m->errorOut(e, "ChimeraCheckRDP", "findIS");
250                 exit(1);
251         }
252 }
253 //***************************************************************************************************************
254 void ChimeraCheckRDP::readName(string namefile) {
255         try{
256         
257                 string name;
258
259         #ifdef USE_MPI
260                 
261                 MPI_File inMPI;
262                 MPI_Offset size;
263                 MPI_Status status;
264                 
265                 char inFileName[namefile.length()];
266                 strcpy(inFileName, namefile.c_str());
267
268                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
269                 MPI_File_get_size(inMPI, &size);
270
271                 char buffer[size];
272                 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
273
274                 string tempBuf = buffer;
275                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
276                 istringstream iss (tempBuf,istringstream::in);
277                 
278                 while(!iss.eof()) {
279                         iss >> name; gobble(iss);
280                         names[name] = name;
281                 }
282         
283                 MPI_File_close(&inMPI);
284                 
285         #else   
286         
287                 ifstream in;
288                 openInputFile(namefile, in);
289                                 
290                 while (!in.eof()) {
291                         in >> name; gobble(in);
292                         names[name] = name;
293                 }
294                 in.close();
295         
296         #endif
297         
298         }
299         catch(exception& e) {
300                 m->errorOut(e, "ChimeraCheckRDP", "readName");
301                 exit(1);
302         }
303 }
304
305 //***************************************************************************************************************
306 //find the smaller map and iterate through it and count kmers in common
307 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
308         try{
309                 
310                 int common = 0;
311                 map<int, int>::iterator small;
312                 map<int, int>::iterator large;
313                 
314                 if (query.size() < subject.size()) {
315                 
316                         for (small = query.begin(); small != query.end(); small++) {
317                                 large = subject.find(small->first);
318                                 
319                                 //if you found it they have that kmer in common
320                                 if (large != subject.end()) {   common++;       }
321                         }
322                         
323                 }else { 
324                  
325                         for (small = subject.begin(); small != subject.end(); small++) {
326                                 large = query.find(small->first);
327                                 
328                                 //if you found it they have that kmer in common
329                                 if (large != query.end()) {             common++;        }
330                         }
331                 }
332                 
333                 return common;
334                 
335         }
336         catch(exception& e) {
337                 m->errorOut(e, "ChimeraCheckRDP", "calcKmers");
338                 exit(1);
339         }
340 }
341 #ifdef USE_MPI
342 //***************************************************************************************************************
343 void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
344         try{
345                 
346                 string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
347                 
348                 MPI_File outSVG;
349                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
350                 
351                 char FileName[file.length()];
352                 strcpy(FileName, file.c_str());
353                 
354                 MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outSVG);  //comm, filename, mode, info, filepointer
355                 
356                 int width = (info.size()*5) + 150;
357                 
358                 string outString = "";
359                 
360                 outString += "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
361                 outString += "<g>\n";
362                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
363                 
364                 outString +=  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
365                 outString +=  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
366                 
367                 outString += "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
368                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
369                 outString += "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
370                 
371                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
372                 
373                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
374                 
375                 
376                 //find max is score
377                 float biggest = 0.0;
378                 for (int i = 0; i < info.size(); i++) {
379                         if (info[i].score > biggest)  {
380                                 biggest = info[i].score;
381                         }
382                 }
383                 
384                 outString += "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
385                 
386                 int scaler2 = 500 / biggest;
387                 
388                 
389                 outString += "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
390                 //160,200 180,230 200,210 234,220\"/> "; 
391                 for (int i = 0; i < info.size(); i++) {
392                         if(info[i].score < 0) { info[i].score = 0; }
393                         outString += toString(((i*5) + 75)) + "," + toString((600 - (info[i].score * scaler2))) + " ";
394                 }
395                 
396                 outString += "\"/> ";
397                 outString += "</g>\n</svg>\n";
398                 
399                 MPI_Status status;
400                 int length = outString.length();
401                 char buf2[length];
402                 strcpy(buf2, outString.c_str()); 
403                                 
404                 MPI_File_write(outSVG, buf2, length, MPI_CHAR, &status);
405                 
406                 MPI_File_close(&outSVG);
407
408         }
409         catch(exception& e) {
410                 m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
411                 exit(1);
412         }
413 }
414 #else
415 //***************************************************************************************************************
416 void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
417         try{
418                 
419                 string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
420                 ofstream outsvg;
421                 openOutputFile(file, outsvg);
422                 
423                 int width = (info.size()*5) + 150;
424                 
425                 outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
426                 outsvg << "<g>\n";
427                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
428                 
429                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
430                 outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
431                 
432                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
433                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
434                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
435                 
436                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
437                 
438                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
439                 
440                 
441                 //find max is score
442                 float biggest = 0.0;
443                 for (int i = 0; i < info.size(); i++) {
444                         if (info[i].score > biggest)  {
445                                 biggest = info[i].score;
446                         }
447                 }
448                 
449                 outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
450                 
451                 int scaler2 = 500 / biggest;
452                 
453                 
454                 outsvg << "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
455                 //160,200 180,230 200,210 234,220\"/> "; 
456                 for (int i = 0; i < info.size(); i++) {
457                         if(info[i].score < 0) { info[i].score = 0; }
458                         outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " ";
459                 }
460                 
461                 outsvg << "\"/> ";
462                 outsvg << "</g>\n</svg>\n";
463                 
464                 outsvg.close();
465
466         }
467         catch(exception& e) {
468                 m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
469                 exit(1);
470         }
471 }
472 #endif
473 //***************************************************************************************************************/
474
475