5 * Created by westcott on 9/8/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "chimeracheckrdp.h"
12 //***************************************************************************************************************
13 ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) { fastafile = filename; templateFile = temp; }
14 //***************************************************************************************************************
16 ChimeraCheckRDP::~ChimeraCheckRDP() {
18 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
23 errorOut(e, "ChimeraCheckRDP", "~AlignSim");
27 //***************************************************************************************************************
28 void ChimeraCheckRDP::print(ostream& out) {
33 for (int i = 0; i < querySeqs.size(); i++) {
35 int j = 0; float largest = -10;
36 //find largest sim value
37 for (int k = 0; k < IS[i].size(); k++) {
38 //is this score larger
39 if (IS[i][k].score > largest) {
41 largest = IS[i][k].score;
45 //find parental similarity
46 distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
47 float dist = distCalc->getDist();
49 //convert to similarity
50 dist = (1 - dist) * 100;
52 //warn about parental similarity - if its above 82% may not detect a chimera
53 if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
55 int index = ceil(dist);
57 if (index == 0) { index=1; }
59 //is your DE value higher than the 95%
61 if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
62 else { chimera = "No"; }
64 out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
66 if (chimera == "Yes") {
67 mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
69 out << "Improvement Score\t";
71 for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
76 errorOut(e, "ChimeraCheckRDP", "print");
81 //***************************************************************************************************************
82 void ChimeraCheckRDP::getChimeras() {
85 //read in query sequences and subject sequences
86 mothurOut("Reading sequences and template file... "); cout.flush();
87 querySeqs = readSeqs(fastafile);
88 //templateSeqs = readSeqs(templateFile);
89 templateDB = new KmerDB(templateFile, kmerSize);
90 mothurOut("Done."); mothurOutEndLine();
92 int numSeqs = querySeqs.size();
95 closest.resize(numSeqs);
97 //break up file if needed
98 int linesPerProcess = numSeqs / processors ;
100 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
101 //find breakup of sequences for all times we will Parallelize
102 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
105 for (int i = 0; i < (processors-1); i++) {
106 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
108 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
109 int i = processors - 1;
110 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
114 lines.push_back(new linePair(0, numSeqs));
117 kmer = new Kmer(kmerSize);
119 //find closest seq to each querySeq
120 for (int i = 0; i < querySeqs.size(); i++) {
121 closest[i] = templateDB->findClosestSequence(querySeqs[i]);
124 //fill seqKmerInfo for query seqs
125 for (int i = 0; i < querySeqs.size(); i++) {
126 seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned());
129 //fill seqKmerInfo for closest
130 for (int i = 0; i < closest.size(); i++) {
131 seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned());
135 //for each query find IS value - this should be paralellized,
136 //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split
137 for (int i = 0; i < querySeqs.size(); i++) {
138 IS[i] = findIS(i); //fills seqKmerInfo
143 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
147 catch(exception& e) {
148 errorOut(e, "ChimeraCheckRDP", "getChimeras");
152 //***************************************************************************************************************
153 vector<sim> ChimeraCheckRDP::findIS(int query) {
156 vector<sim> isValues;
157 string queryName = querySeqs[query]->getName();
158 string seq = querySeqs[query]->getUnaligned();
160 mothurOut("Finding IS values for sequence " + query); mothurOutEndLine();
162 //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
163 int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]);
165 //you don't want the starting point to be virtually at hte end so move it in 10%
166 int start = seq.length() / 10;
169 for (int m = start; m < (seq.length() - start); m+=increment) {
173 string fragLeft = seq.substr(0, m); //left side of breakpoint
174 string fragRight = seq.substr(m, seq.length()); //right side of breakpoint
176 //make a sequence of the left side and right side
177 Sequence* left = new Sequence(queryName, fragLeft);
178 Sequence* right = new Sequence(queryName, fragRight);
180 //find seqs closest to each fragment
181 Sequence closestLeft = templateDB->findClosestSequence(left);
182 Sequence closestRight = templateDB->findClosestSequence(right);
184 map<int, int>::iterator itleft;
185 map<int, int>::iterator itleftclose;
187 //get kmer in the closest seqs
188 //if it's not found calc kmer info and save, otherwise use already calculated data
190 it = seqKmerInfo.find(closestLeft.getName());
191 if (it == seqKmerInfo.end()) { //you have to calc it
192 seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned());
196 it = seqKmerInfo.find(closestRight.getName());
197 if (it == seqKmerInfo.end()) { //you have to calc it
198 seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned());
201 //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
202 //iterate through left sides map to subtract the number of times you saw things before you got the the right side
203 map<int, int> rightside;
204 for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) {
205 int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first]; //times that kmer was seen in total
207 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
208 int howmanyright = howManyTotal - itleft->second;
210 //if any were seen just on the right add that ammount to map
211 if (howmanyright > 0) {
212 rightside[itleft->first] = howmanyright;
216 //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right
217 //this way you can get the map for just the fragment you want to compare and not hte whole sequence
218 map<int, int> rightsideclose;
219 for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) {
220 int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first]; //times that kmer was seen in total
222 //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
223 int howmanyright = howManyTotal - itleftclose->second;
225 //if any were seen just on the right add that ammount to map
226 if (howmanyright > 0) {
227 rightsideclose[itleftclose->first] = howmanyright;
231 int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]);
232 int nRight = calcKmers(rightsideclose, rightside);
234 int is = nLeft + nRight - nTotal;
236 //save IS, leftparent, rightparent, breakpoint
237 temp.leftParent = closestLeft.getName();
238 temp.rightParent = closestRight.getName();
242 isValues.push_back(temp);
251 catch(exception& e) {
252 errorOut(e, "ChimeraCheckRDP", "findIS");
257 //***************************************************************************************************************
258 //find the smaller map and iterate through it and count kmers in common
259 int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
263 map<int, int>::iterator small;
264 map<int, int>::iterator large;
266 if (query.size() < subject.size()) {
268 for (small = query.begin(); small != query.end(); small++) {
269 large = subject.find(small->first);
271 //if you found it they have that kmer in common
272 if (large != subject.end()) { common++; }
277 for (small = subject.begin(); small != subject.end(); small++) {
278 large = query.find(small->first);
280 //if you found it they have that kmer in common
281 if (large != query.end()) { common++; }
288 catch(exception& e) {
289 errorOut(e, "ChimeraCheckRDP", "calcKmers");
294 //***************************************************************************************************************