]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckrdp.cpp
continued work on chimeras and fixed bug in trim.seqs and reverse.seqs that was due...
[mothur.git] / chimeracheckrdp.cpp
1 /*
2  *  chimeracheckrdp.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/8/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckrdp.h"
11                 
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
14 //***************************************************************************************************************
15
16 ChimeraCheckRDP::~ChimeraCheckRDP() {
17         try {
18                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
19                 delete templateDB;
20                 delete kmer;
21         }
22         catch(exception& e) {
23                 errorOut(e, "ChimeraCheckRDP", "~AlignSim");
24                 exit(1);
25         }
26 }       
27 //***************************************************************************************************************
28 void ChimeraCheckRDP::print(ostream& out) {
29         try {
30                 
31                 mothurOutEndLine();
32         /*      
33                 for (int i = 0; i < querySeqs.size(); i++) {
34                         
35                         int j = 0;  float largest = -10;
36                         //find largest sim value
37                         for (int k = 0; k < IS[i].size(); k++) {
38                                 //is this score larger
39                                 if (IS[i][k].score > largest) {
40                                         j = k;
41                                         largest = IS[i][k].score;
42                                 }
43                         }
44                         
45                         //find parental similarity
46                         distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
47                         float dist = distCalc->getDist();
48                         
49                         //convert to similarity
50                         dist = (1 - dist) * 100;
51
52                         //warn about parental similarity - if its above 82% may not detect a chimera 
53                         if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
54                         
55                         int index = ceil(dist);
56                         
57                         if (index == 0) { index=1;  }
58         
59                         //is your DE value higher than the 95%
60                         string chimera;
61                         if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
62                         else                                                                            {       chimera = "No";         }                       
63                         
64                         out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
65                         
66                         if (chimera == "Yes") {
67                                 mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
68                         }
69                         out << "Improvement Score\t";
70                         
71                         for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
72                         out << endl;
73                 }*/
74         }
75         catch(exception& e) {
76                 errorOut(e, "ChimeraCheckRDP", "print");
77                 exit(1);
78         }
79 }
80
81 //***************************************************************************************************************
82 void ChimeraCheckRDP::getChimeras() {
83         try {
84                 
85                 //read in query sequences and subject sequences
86                 mothurOut("Reading sequences and template file... "); cout.flush();
87                 querySeqs = readSeqs(fastafile);
88                 //templateSeqs = readSeqs(templateFile);
89                 templateDB = new KmerDB(templateFile, kmerSize);
90                 mothurOut("Done."); mothurOutEndLine();
91                 
92                 int numSeqs = querySeqs.size();
93                 
94                 IS.resize(numSeqs);
95                 closest.resize(numSeqs);
96                 
97                 //break up file if needed
98                 int linesPerProcess = numSeqs / processors ;
99                 
100                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
101                         //find breakup of sequences for all times we will Parallelize
102                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
103                         else {
104                                 //fill line pairs
105                                 for (int i = 0; i < (processors-1); i++) {                      
106                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
107                                 }
108                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
109                                 int i = processors - 1;
110                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
111                         }
112                         
113                 #else
114                         lines.push_back(new linePair(0, numSeqs));
115                 #endif
116                 
117                 kmer = new Kmer(kmerSize);
118                 
119                 //find closest seq to each querySeq
120                 for (int i = 0; i < querySeqs.size(); i++) {
121                         closest[i] = templateDB->findClosestSequence(querySeqs[i]);  
122                 }
123                 
124                 //fill seqKmerInfo for query seqs
125                 for (int i = 0; i < querySeqs.size(); i++) {
126                         seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned());
127                 }
128                 
129                 //fill seqKmerInfo for closest
130                 for (int i = 0; i < closest.size(); i++) {
131                         seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned());
132                 }
133
134                 
135                 //for each query find IS value - this should be paralellized, 
136                 //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split
137                 for (int i = 0; i < querySeqs.size(); i++) {
138                         IS[i] = findIS(i);  //fills seqKmerInfo
139                 }
140                 
141                 
142                 //free memory
143                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
144         
145                         
146         }
147         catch(exception& e) {
148                 errorOut(e, "ChimeraCheckRDP", "getChimeras");
149                 exit(1);
150         }
151 }
152 //***************************************************************************************************************
153 vector<sim> ChimeraCheckRDP::findIS(int query) {
154         try {
155                 
156                 vector<sim>  isValues;
157                 string queryName = querySeqs[query]->getName();
158                 string seq = querySeqs[query]->getUnaligned();
159                 
160                 mothurOut("Finding IS values for sequence " + query); mothurOutEndLine();
161                 
162                 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
163                 int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]);
164                 
165                 //you don't want the starting point to be virtually at hte end so move it in 10%
166                 int start = seq.length() / 10;
167                         
168                 //for each window
169                 for (int m = start; m < (seq.length() - start); m+=increment) {
170                         
171                         sim temp;
172                         
173                         string fragLeft = seq.substr(0, m);  //left side of breakpoint
174                         string fragRight = seq.substr(m, seq.length());  //right side of breakpoint
175                         
176                         //make a sequence of the left side and right side
177                         Sequence* left = new Sequence(queryName, fragLeft);
178                         Sequence* right = new Sequence(queryName, fragRight);
179                         
180                         //find seqs closest to each fragment
181                         Sequence closestLeft = templateDB->findClosestSequence(left); 
182                         Sequence closestRight = templateDB->findClosestSequence(right); 
183                         
184                         map<int, int>::iterator itleft;
185                         map<int, int>::iterator itleftclose;
186                         
187                         //get kmer in the closest seqs
188                         //if it's not found calc kmer info and save, otherwise use already calculated data
189                         //left
190                         it = seqKmerInfo.find(closestLeft.getName());
191                         if (it == seqKmerInfo.end()) {  //you have to calc it
192                                 seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned());
193                         }
194                         
195                         //right
196                         it = seqKmerInfo.find(closestRight.getName());
197                         if (it == seqKmerInfo.end()) {  //you have to calc it
198                                 seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned());
199                         }
200                         
201                         //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
202                         //iterate through left sides map to subtract the number of times you saw things before you got the the right side
203                         map<int, int> rightside;
204                         for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) {
205                                 int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first];   //times that kmer was seen in total
206                                 
207                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
208                                 int howmanyright = howManyTotal - itleft->second;
209                                 
210                                 //if any were seen just on the right add that ammount to map
211                                 if (howmanyright > 0) {
212                                         rightside[itleft->first] = howmanyright;
213                                 }
214                         }
215                         
216                         //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right
217                         //this way you can get the map for just the fragment you want to compare and not hte whole sequence
218                         map<int, int> rightsideclose;
219                         for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) {
220                                 int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first];   //times that kmer was seen in total
221                                 
222                                 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
223                                 int howmanyright = howManyTotal - itleftclose->second;
224                                 
225                                 //if any were seen just on the right add that ammount to map
226                                 if (howmanyright > 0) {
227                                         rightsideclose[itleftclose->first] = howmanyright;
228                                 }
229                         }
230
231                         int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]);
232                         int nRight = calcKmers(rightsideclose, rightside);
233                         
234                         int is = nLeft + nRight - nTotal;
235                         
236                         //save IS, leftparent, rightparent, breakpoint
237                         temp.leftParent = closestLeft.getName();
238                         temp.rightParent = closestRight.getName();
239                         temp.score = is;
240                         temp.midpoint = m;
241                         
242                         isValues.push_back(temp);
243                         
244                         delete left;
245                         delete right;
246                 }       
247                 
248                 return isValues;
249         
250         }
251         catch(exception& e) {
252                 errorOut(e, "ChimeraCheckRDP", "findIS");
253                 exit(1);
254         }
255 }
256
257 //***************************************************************************************************************
258 //find the smaller map and iterate through it and count kmers in common
259 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
260         try{
261                 
262                 int common = 0;
263                 map<int, int>::iterator small;
264                 map<int, int>::iterator large;
265                 
266                 if (query.size() < subject.size()) {
267                 
268                         for (small = query.begin(); small != query.end(); small++) {
269                                 large = subject.find(small->first);
270                                 
271                                 //if you found it they have that kmer in common
272                                 if (large != subject.end()) {   common++;       }
273                         }
274                         
275                 }else { 
276                  
277                         for (small = subject.begin(); small != subject.end(); small++) {
278                                 large = query.find(small->first);
279                                 
280                                 //if you found it they have that kmer in common
281                                 if (large != query.end()) {             common++;       }
282                         }
283                 }
284                 
285                 return common;
286                 
287         }
288         catch(exception& e) {
289                 errorOut(e, "ChimeraCheckRDP", "calcKmers");
290                 exit(1);
291         }
292 }
293
294 //***************************************************************************************************************
295