ccode.cpp

   1 /*
   2  *  ccode.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 8/24/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "ccode.h"
  11 #include "ignoregaps.h"
  12 #include "eachgapdist.h"
  13
  14
  15 //***************************************************************************************************************
  16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
  17 //***************************************************************************************************************
  18
  19 Ccode::~Ccode() {
  20         try {
  21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
  22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
  23                 delete distCalc;
  24         }
  25         catch(exception& e) {
  26                 errorOut(e, "Ccode", "~Ccode");
  27                 exit(1);
  28         }
  29 }
  30 //***************************************************************************************************************
  31 void Ccode::print(ostream& out) {
  32         try {
  33
  34                 mothurOutEndLine();
  35
  36                 for (int i = 0; i < querySeqs.size(); i++) {
  37
  38                 }
  39         }
  40         catch(exception& e) {
  41                 errorOut(e, "Ccode", "print");
  42                 exit(1);
  43         }
  44 }
  45
  46 //***************************************************************************************************************
  47 void Ccode::getChimeras() {
  48         try {
  49
  50                 //read in query sequences and subject sequences
  51                 mothurOut("Reading sequences and template file... "); cout.flush();
  52                 querySeqs = readSeqs(fastafile);
  53                 templateSeqs = readSeqs(templateFile);
  54                 mothurOut("Done."); mothurOutEndLine();
  55
  56                 int numSeqs = querySeqs.size();
  57
  58                 closest.resize(numSeqs);
  59
  60                 //break up file if needed
  61                 int linesPerProcess = numSeqs / processors ;
  62
  63                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
  64                         //find breakup of sequences for all times we will Parallelize
  65                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
  66                         else {
  67                                 //fill line pairs
  68                                 for (int i = 0; i < (processors-1); i++) {
  69                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
  70                                 }
  71                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
  72                                 int i = processors - 1;
  73                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
  74                         }
  75
  76                         //find breakup of templatefile for quantiles
  77                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
  78                         else {
  79                                 for (int i = 0; i < processors; i++) {
  80                                         templateLines.push_back(new linePair());
  81                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
  82                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
  83                                 }
  84                         }
  85                 #else
  86                         lines.push_back(new linePair(0, numSeqs));
  87                         templateLines.push_back(new linePair(0, templateSeqs.size()));
  88                 #endif
  89
  90                 distCalc = new eachGapDist();
  91                 decalc = new DeCalculator();
  92
  93                 //find closest
  94                 if (processors == 1) {
  95                         mothurOut("Finding top matches for sequences... "); cout.flush();
  96                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
  97                         mothurOut("Done."); mothurOutEndLine();
  98                 }else {         createProcessesClosest();               }
  99
 100
 101 for (int i = 0; i < closest.size(); i++) {
 102         cout << querySeqs[i]->getName() << ": ";
 103         for (int j = 0; j < closest[i].size(); j++) {
 104
 105                 cout << closest[i][j]->getName() << '\t';
 106         }
 107         cout << endl;
 108 }
 109
 110                 //mask sequences if the user wants to
 111                 if (seqMask != "") {
 112                         //mask querys
 113                         for (int i = 0; i < querySeqs.size(); i++) {
 114                                 decalc->runMask(querySeqs[i]);
 115                         }
 116
 117                         //mask templates
 118                         for (int i = 0; i < templateSeqs.size(); i++) {
 119                                 decalc->runMask(templateSeqs[i]);
 120                         }
 121                 }
 122
 123                 if (filter) {
 124                         vector<Sequence*> temp = templateSeqs;
 125                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
 126
 127                         createFilter(temp);
 128
 129                         runFilter(querySeqs);
 130                         runFilter(templateSeqs);
 131                 }
 132
 133                 //trim sequences - this follows ccodes remove_extra_gaps
 134                 //just need to pass it query and template since closest points to template
 135                 trimSequences();
 136
 137                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
 138                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
 139                 windows = findWindows();
 140
 141                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
 142                 for (int i = 0; i < closest.size(); i++) {
 143                         removeBadReferenceSeqs(closest[i], i);
 144                 }
 145
 146
 147                 //free memory
 148                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
 149                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
 150
 151         }
 152         catch(exception& e) {
 153                 errorOut(e, "Ccode", "getChimeras");
 154                 exit(1);
 155         }
 156 }
 157 /***************************************************************************************************************/
 158 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
 159 void Ccode::trimSequences() {
 160         try {
 161
 162                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
 163                 int rearPos = querySeqs[0]->getAligned().length();
 164
 165                 //********find first position in all seqs that is a non gap character***********//
 166                 //find first position all query seqs that is a non gap character
 167                 for (int i = 0; i < querySeqs.size(); i++) {
 168
 169                         string aligned = querySeqs[i]->getAligned();
 170                         int pos = 0;
 171
 172                         //find first spot in this seq
 173                         for (int j = 0; j < aligned.length(); j++) {
 174                                 if (isalpha(aligned[j])) {
 175                                         pos = j;
 176                                         break;
 177                                 }
 178                         }
 179
 180                         //save this spot if it is the farthest
 181                         if (pos > frontPos) { frontPos = pos; }
 182                 }
 183
 184                 //find first position all template seqs that is a non gap character
 185                 for (int i = 0; i < templateSeqs.size(); i++) {
 186
 187                         string aligned = templateSeqs[i]->getAligned();
 188                         int pos = 0;
 189
 190                         //find first spot in this seq
 191                         for (int j = 0; j < aligned.length(); j++) {
 192                                 if (isalpha(aligned[j])) {
 193                                         pos = j;
 194                                         break;
 195                                 }
 196                         }
 197
 198                         //save this spot if it is the farthest
 199                         if (pos > frontPos) { frontPos = pos; }
 200                 }
 201
 202
 203                 //********find last position in all seqs that is a non gap character***********//
 204                 //find last position all query seqs that is a non gap character
 205                 for (int i = 0; i < querySeqs.size(); i++) {
 206
 207                         string aligned = querySeqs[i]->getAligned();
 208                         int pos = aligned.length();
 209
 210                         //find first spot in this seq
 211                         for (int j = aligned.length()-1; j >= 0; j--) {
 212                                 if (isalpha(aligned[j])) {
 213                                         pos = j;
 214                                         break;
 215                                 }
 216                         }
 217
 218                         //save this spot if it is the farthest
 219                         if (pos < rearPos) { rearPos = pos; }
 220                 }
 221
 222                 //find last position all template seqs that is a non gap character
 223                 for (int i = 0; i < templateSeqs.size(); i++) {
 224
 225                         string aligned = templateSeqs[i]->getAligned();
 226                         int pos = aligned.length();
 227
 228                         //find first spot in this seq
 229                         for (int j = aligned.length()-1; j >= 0; j--) {
 230                                 if (isalpha(aligned[j])) {
 231                                         pos = j;
 232                                         break;
 233                                 }
 234                         }
 235
 236                         //save this spot if it is the farthest
 237                         if (pos < rearPos) { rearPos = pos; }
 238                 }
 239
 240
 241                 //check to make sure that is not whole seq
 242                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
 243
 244                 //***********trim all seqs to that position***************//
 245                 for (int i = 0; i < querySeqs.size(); i++) {
 246
 247                         string aligned = querySeqs[i]->getAligned();
 248
 249                         //between the two points
 250                         aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
 251
 252                         querySeqs[i]->setAligned(aligned);
 253                 }
 254
 255                 for (int i = 0; i < templateSeqs.size(); i++) {
 256
 257                         string aligned = templateSeqs[i]->getAligned();
 258
 259                         //between the two points
 260                         aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
 261
 262                         templateSeqs[i]->setAligned(aligned);
 263                 }
 264
 265         }
 266         catch(exception& e) {
 267                 errorOut(e, "Ccode", "trimSequences");
 268                 exit(1);
 269         }
 270
 271 }
 272 /***************************************************************************************************************/
 273 vector<int> Ccode::findWindows() {
 274         try {
 275
 276                 vector<int> win;
 277                 int length = querySeqs[0]->getAligned().length();
 278
 279                 //default is wanted = 10% of total length
 280                 if (window > length) {
 281                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
 282                         window = length / 10;
 283                 }else if (window == 0) { window = length / 10;  }
 284                 else if (window > (length / 20)) {
 285                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
 286                 }else if (window < (length / 5)) {
 287                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
 288                 }
 289
 290                 //save starting points of each window
 291                 for (int m = 0;  m < (length-window); m+=window) {  win.push_back(m);  }
 292
 293                 return win;
 294
 295         }
 296         catch(exception& e) {
 297                 errorOut(e, "Ccode", "findWindows");
 298                 exit(1);
 299         }
 300 }
 301 //***************************************************************************************************************
 302 int Ccode::getDiff(string seqA, string seqB) {
 303         try {
 304
 305                 int numDiff = 0;
 306
 307                 for (int i = 0; i < seqA.length(); i++) {
 308                         //if you are both not gaps
 309                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
 310                                 //are you different
 311                                 if (seqA[i] != seqB[i]) { numDiff++; }
 312                         //}
 313                 }
 314
 315                 return numDiff;
 316
 317         }
 318         catch(exception& e) {
 319                 errorOut(e, "Ccode", "getDiff");
 320                 exit(1);
 321         }
 322 }
 323 //***************************************************************************************************************
 324 //tried to make this look most like ccode original implementation
 325 void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
 326         try {
 327
 328                 vector< vector<int> > numDiffBases;
 329                 numDiffBases.resize(seqs.size());
 330                 //initialize to 0
 331                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
 332
 333                 int length = seqs[0]->getAligned().length();
 334
 335                 //calc differences from each sequence to everyother seq in the set
 336                 for (int i = 0; i < seqs.size(); i++) {
 337
 338                         string seqA = seqs[i]->getAligned();
 339
 340                         //so you don't calc i to j and j to i since they are the same
 341                         for (int j = 0; j < i; j++) {
 342
 343                                 string seqB = seqs[j]->getAligned();
 344
 345                                 //compare strings
 346                                 int numDiff = getDiff(seqA, seqB);
 347
 348                                 numDiffBases[i][j] = numDiff;
 349                                 numDiffBases[j][i] = numDiff;
 350                         }
 351                 }
 352
 353                 //initailize remove to 0
 354                 vector<int> remove;  remove.resize(seqs.size(), 0);
 355
 356                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
 357                 for (int i = 0; i < numDiffBases.size(); i++) {
 358                         for (int j = 0; j < numDiffBases[i].size(); j++) {
 359
 360                                 //are you more than 20% different
 361                                 if (numDiffBases[i][j] > ((20*length) / 100))           {  remove[j] = 1;  }
 362                                 //are you less than 0.5% different
 363                                 if (numDiffBases[i][j] < ((0.5*length) / 100))  {  remove[j] = 1;  }
 364                         }
 365                 }
 366
 367                 int numSeqsLeft = 0;
 368
 369                 //count seqs that are not going to be removed
 370                 for (int i = 0; i < remove.size(); i++) {
 371                         if (remove[i] == 0)  { numSeqsLeft++;  }
 372                 }
 373
 374                 //if you have enough then remove bad ones
 375                 if (numSeqsLeft >= 3) {
 376                         vector<Sequence*> goodSeqs;
 377                         //remove bad seqs
 378                         for (int i = 0; i < remove.size(); i++) {
 379                                 if (remove[i] == 0) {
 380                                         goodSeqs.push_back(seqs[i]);
 381                                 }
 382                         }
 383
 384                         seqs = goodSeqs;
 385
 386                 }else { //warn, but dont remove any
 387                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();
 388                 }
 389
 390
 391         }
 392         catch(exception& e) {
 393                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
 394                 exit(1);
 395         }
 396 }
 397 //***************************************************************************************************************
 398 vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
 399         try{
 400
 401                 vector< vector<Sequence*> > topMatches;  topMatches.resize(querySeqs.size());
 402
 403                 float smallestOverall, smallestLeft, smallestRight;
 404                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
 405
 406                 //for each sequence in querySeqs - find top matches to use as reference
 407                 for(int j = start; j < end; j++){
 408
 409                         Sequence query = *(querySeqs[j]);
 410
 411                         vector<SeqDist> distances;
 412
 413                         //calc distance to each sequence in template seqs
 414                         for (int i = 0; i < templateSeqs.size(); i++) {
 415
 416                                 Sequence ref = *(templateSeqs[i]);
 417
 418                                 //find overall dist
 419                                 distCalc->calcDist(query, ref);
 420                                 float dist = distCalc->getDist();
 421
 422                                 //save distance
 423                                 SeqDist temp;
 424                                 temp.seq = templateSeqs[i];
 425                                 temp.dist = dist;
 426
 427                                 distances.push_back(temp);
 428                         }
 429
 430                         sort(distances.begin(), distances.end(), compareSeqDist);
 431
 432                         //save the number of top matches wanted
 433                         for (int h = 0; h < numWanted; h++) {
 434                                 topMatches[j].push_back(distances[h].seq);
 435                         }
 436                 }
 437
 438                 return topMatches;
 439
 440         }
 441         catch(exception& e) {
 442                 errorOut(e, "Ccode", "findClosestSides");
 443                 exit(1);
 444         }
 445 }
 446 /**************************************************************************************************/
 447 vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
 448         try {
 449         }
 450         catch(exception& e) {
 451                 errorOut(e, "Ccode", "getAverageRef");
 452                 exit(1);
 453         }
 454 }
 455 /**************************************************************************************************/
 456 vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
 457         try {
 458
 459
 460         }
 461         catch(exception& e) {
 462                 errorOut(e, "Ccode", "getAverageQuery");
 463                 exit(1);
 464         }
 465 }
 466 /**************************************************************************************************/
 467 void Ccode::createProcessesClosest() {
 468         try {
 469 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 470                 int process = 0;
 471                 vector<int> processIDS;
 472
 473                 //loop through and create all the processes you want
 474                 while (process != processors) {
 475                         int pid = fork();
 476
 477                         if (pid > 0) {
 478                                 processIDS.push_back(pid);
 479                                 process++;
 480                         }else if (pid == 0){
 481
 482                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
 483                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
 484                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
 485
 486                                 //write out data to file so parent can read it
 487                                 ofstream out;
 488                                 string s = toString(getpid()) + ".temp";
 489                                 openOutputFile(s, out);
 490
 491                                 //output pairs
 492                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
 493                                          for (int j = 0; j < closest[i].size(); j++) {
 494                                                 closest[i][j]->printSequence(out);
 495                                          }
 496                                 }
 497                                 out.close();
 498
 499                                 exit(0);
 500                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
 501                 }
 502
 503                 //force parent to wait until all the processes are done
 504                 for (int i=0;i<processors;i++) {
 505                         int temp = processIDS[i];
 506                         wait(&temp);
 507                 }
 508
 509                 //get data created by processes
 510                 for (int i=0;i<processors;i++) {
 511                         ifstream in;
 512                         string s = toString(processIDS[i]) + ".temp";
 513                         openInputFile(s, in);
 514
 515                         //get pairs
 516                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
 517
 518                                 vector<Sequence*> tempVector;
 519
 520                                 for (int j = 0; j < numWanted; j++) {
 521
 522                                         Sequence* temp = new Sequence(in);
 523                                         gobble(in);
 524
 525                                         tempVector.push_back(temp);
 526                                 }
 527
 528                                 closest[k] = tempVector;
 529                         }
 530
 531                         in.close();
 532                         remove(s.c_str());
 533                 }
 534
 535
 536 #else
 537                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
 538 #endif
 539
 540         }
 541         catch(exception& e) {
 542                 errorOut(e, "Ccode", "createProcessesClosest");
 543                 exit(1);
 544         }
 545 }
 546
 547 //***************************************************************************************************************
 548