5 * Created by westcott on 8/24/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) { fastafile = filename; templateFile = temp; }
17 //***************************************************************************************************************
21 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
22 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
26 errorOut(e, "Ccode", "~Ccode");
30 //***************************************************************************************************************
31 void Ccode::print(ostream& out) {
36 for (int i = 0; i < querySeqs.size(); i++) {
41 errorOut(e, "Ccode", "print");
46 //***************************************************************************************************************
47 void Ccode::getChimeras() {
50 //read in query sequences and subject sequences
51 mothurOut("Reading sequences and template file... "); cout.flush();
52 querySeqs = readSeqs(fastafile);
53 templateSeqs = readSeqs(templateFile);
54 mothurOut("Done."); mothurOutEndLine();
56 int numSeqs = querySeqs.size();
58 closest.resize(numSeqs);
60 //break up file if needed
61 int linesPerProcess = numSeqs / processors ;
63 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
64 //find breakup of sequences for all times we will Parallelize
65 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
68 for (int i = 0; i < (processors-1); i++) {
69 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
71 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
72 int i = processors - 1;
73 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
76 //find breakup of templatefile for quantiles
77 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
79 for (int i = 0; i < processors; i++) {
80 templateLines.push_back(new linePair());
81 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
82 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
86 lines.push_back(new linePair(0, numSeqs));
87 templateLines.push_back(new linePair(0, templateSeqs.size()));
90 distCalc = new eachGapDist();
91 decalc = new DeCalculator();
94 if (processors == 1) {
95 mothurOut("Finding top matches for sequences... "); cout.flush();
96 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
97 mothurOut("Done."); mothurOutEndLine();
98 }else { createProcessesClosest(); }
101 for (int i = 0; i < closest.size(); i++) {
102 cout << querySeqs[i]->getName() << ": ";
103 for (int j = 0; j < closest[i].size(); j++) {
105 cout << closest[i][j]->getName() << '\t';
110 //mask sequences if the user wants to
113 for (int i = 0; i < querySeqs.size(); i++) {
114 decalc->runMask(querySeqs[i]);
118 for (int i = 0; i < templateSeqs.size(); i++) {
119 decalc->runMask(templateSeqs[i]);
124 vector<Sequence*> temp = templateSeqs;
125 for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
129 runFilter(querySeqs);
130 runFilter(templateSeqs);
133 //trim sequences - this follows ccodes remove_extra_gaps
134 //just need to pass it query and template since closest points to template
137 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
138 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
139 windows = findWindows();
141 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
142 for (int i = 0; i < closest.size(); i++) {
143 removeBadReferenceSeqs(closest[i], i);
148 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
149 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
152 catch(exception& e) {
153 errorOut(e, "Ccode", "getChimeras");
157 /***************************************************************************************************************/
158 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
159 void Ccode::trimSequences() {
162 int frontPos = 0; //should contain first position in all seqs that is not a gap character
163 int rearPos = querySeqs[0]->getAligned().length();
165 //********find first position in all seqs that is a non gap character***********//
166 //find first position all query seqs that is a non gap character
167 for (int i = 0; i < querySeqs.size(); i++) {
169 string aligned = querySeqs[i]->getAligned();
172 //find first spot in this seq
173 for (int j = 0; j < aligned.length(); j++) {
174 if (isalpha(aligned[j])) {
180 //save this spot if it is the farthest
181 if (pos > frontPos) { frontPos = pos; }
184 //find first position all template seqs that is a non gap character
185 for (int i = 0; i < templateSeqs.size(); i++) {
187 string aligned = templateSeqs[i]->getAligned();
190 //find first spot in this seq
191 for (int j = 0; j < aligned.length(); j++) {
192 if (isalpha(aligned[j])) {
198 //save this spot if it is the farthest
199 if (pos > frontPos) { frontPos = pos; }
203 //********find last position in all seqs that is a non gap character***********//
204 //find last position all query seqs that is a non gap character
205 for (int i = 0; i < querySeqs.size(); i++) {
207 string aligned = querySeqs[i]->getAligned();
208 int pos = aligned.length();
210 //find first spot in this seq
211 for (int j = aligned.length()-1; j >= 0; j--) {
212 if (isalpha(aligned[j])) {
218 //save this spot if it is the farthest
219 if (pos < rearPos) { rearPos = pos; }
222 //find last position all template seqs that is a non gap character
223 for (int i = 0; i < templateSeqs.size(); i++) {
225 string aligned = templateSeqs[i]->getAligned();
226 int pos = aligned.length();
228 //find first spot in this seq
229 for (int j = aligned.length()-1; j >= 0; j--) {
230 if (isalpha(aligned[j])) {
236 //save this spot if it is the farthest
237 if (pos < rearPos) { rearPos = pos; }
241 //check to make sure that is not whole seq
242 if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); }
244 //***********trim all seqs to that position***************//
245 for (int i = 0; i < querySeqs.size(); i++) {
247 string aligned = querySeqs[i]->getAligned();
249 //between the two points
250 aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
252 querySeqs[i]->setAligned(aligned);
255 for (int i = 0; i < templateSeqs.size(); i++) {
257 string aligned = templateSeqs[i]->getAligned();
259 //between the two points
260 aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
262 templateSeqs[i]->setAligned(aligned);
266 catch(exception& e) {
267 errorOut(e, "Ccode", "trimSequences");
272 /***************************************************************************************************************/
273 vector<int> Ccode::findWindows() {
277 int length = querySeqs[0]->getAligned().length();
279 //default is wanted = 10% of total length
280 if (window > length) {
281 mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
282 window = length / 10;
283 }else if (window == 0) { window = length / 10; }
284 else if (window > (length / 20)) {
285 mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
286 }else if (window < (length / 5)) {
287 mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
290 //save starting points of each window
291 for (int m = 0; m < (length-window); m+=window) { win.push_back(m); }
296 catch(exception& e) {
297 errorOut(e, "Ccode", "findWindows");
301 //***************************************************************************************************************
302 int Ccode::getDiff(string seqA, string seqB) {
307 for (int i = 0; i < seqA.length(); i++) {
308 //if you are both not gaps
309 //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
311 if (seqA[i] != seqB[i]) { numDiff++; }
318 catch(exception& e) {
319 errorOut(e, "Ccode", "getDiff");
323 //***************************************************************************************************************
324 //tried to make this look most like ccode original implementation
325 void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
328 vector< vector<int> > numDiffBases;
329 numDiffBases.resize(seqs.size());
331 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
333 int length = seqs[0]->getAligned().length();
335 //calc differences from each sequence to everyother seq in the set
336 for (int i = 0; i < seqs.size(); i++) {
338 string seqA = seqs[i]->getAligned();
340 //so you don't calc i to j and j to i since they are the same
341 for (int j = 0; j < i; j++) {
343 string seqB = seqs[j]->getAligned();
346 int numDiff = getDiff(seqA, seqB);
348 numDiffBases[i][j] = numDiff;
349 numDiffBases[j][i] = numDiff;
353 //initailize remove to 0
354 vector<int> remove; remove.resize(seqs.size(), 0);
356 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
357 for (int i = 0; i < numDiffBases.size(); i++) {
358 for (int j = 0; j < numDiffBases[i].size(); j++) {
360 //are you more than 20% different
361 if (numDiffBases[i][j] > ((20*length) / 100)) { remove[j] = 1; }
362 //are you less than 0.5% different
363 if (numDiffBases[i][j] < ((0.5*length) / 100)) { remove[j] = 1; }
369 //count seqs that are not going to be removed
370 for (int i = 0; i < remove.size(); i++) {
371 if (remove[i] == 0) { numSeqsLeft++; }
374 //if you have enough then remove bad ones
375 if (numSeqsLeft >= 3) {
376 vector<Sequence*> goodSeqs;
378 for (int i = 0; i < remove.size(); i++) {
379 if (remove[i] == 0) {
380 goodSeqs.push_back(seqs[i]);
386 }else { //warn, but dont remove any
387 mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine();
392 catch(exception& e) {
393 errorOut(e, "Ccode", "removeBadReferenceSeqs");
397 //***************************************************************************************************************
398 vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
401 vector< vector<Sequence*> > topMatches; topMatches.resize(querySeqs.size());
403 float smallestOverall, smallestLeft, smallestRight;
404 smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
406 //for each sequence in querySeqs - find top matches to use as reference
407 for(int j = start; j < end; j++){
409 Sequence query = *(querySeqs[j]);
411 vector<SeqDist> distances;
413 //calc distance to each sequence in template seqs
414 for (int i = 0; i < templateSeqs.size(); i++) {
416 Sequence ref = *(templateSeqs[i]);
419 distCalc->calcDist(query, ref);
420 float dist = distCalc->getDist();
424 temp.seq = templateSeqs[i];
427 distances.push_back(temp);
430 sort(distances.begin(), distances.end(), compareSeqDist);
432 //save the number of top matches wanted
433 for (int h = 0; h < numWanted; h++) {
434 topMatches[j].push_back(distances[h].seq);
441 catch(exception& e) {
442 errorOut(e, "Ccode", "findClosestSides");
446 /**************************************************************************************************/
447 vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
450 catch(exception& e) {
451 errorOut(e, "Ccode", "getAverageRef");
455 /**************************************************************************************************/
456 vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
461 catch(exception& e) {
462 errorOut(e, "Ccode", "getAverageQuery");
466 /**************************************************************************************************/
467 void Ccode::createProcessesClosest() {
469 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
471 vector<int> processIDS;
473 //loop through and create all the processes you want
474 while (process != processors) {
478 processIDS.push_back(pid);
482 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
483 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
484 mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
486 //write out data to file so parent can read it
488 string s = toString(getpid()) + ".temp";
489 openOutputFile(s, out);
492 for (int i = lines[process]->start; i < lines[process]->end; i++) {
493 for (int j = 0; j < closest[i].size(); j++) {
494 closest[i][j]->printSequence(out);
500 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
503 //force parent to wait until all the processes are done
504 for (int i=0;i<processors;i++) {
505 int temp = processIDS[i];
509 //get data created by processes
510 for (int i=0;i<processors;i++) {
512 string s = toString(processIDS[i]) + ".temp";
513 openInputFile(s, in);
516 for (int k = lines[i]->start; k < lines[i]->end; k++) {
518 vector<Sequence*> tempVector;
520 for (int j = 0; j < numWanted; j++) {
522 Sequence* temp = new Sequence(in);
525 tempVector.push_back(temp);
528 closest[k] = tempVector;
537 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
541 catch(exception& e) {
542 errorOut(e, "Ccode", "createProcessesClosest");
547 //***************************************************************************************************************