chimeraperseuscommand.h

   1 #ifndef CHIMERAPERSEUSCOMMAND_H
   2 #define CHIMERAPERSEUSCOMMAND_H
   3
   4
   5 /*
   6  *  chimeraperseuscommand.h
   7  *  Mothur
   8  *
   9  *  Created by westcott on 10/26/11.
  10  *  Copyright 2011 Schloss Lab. All rights reserved.
  11  *
  12  */
  13
  14
  15
  16 #include "mothur.h"
  17 #include "command.hpp"
  18 #include "sequenceparser.h"
  19 #include "sequencecountparser.h"
  20 #include "myPerseus.h"
  21 #include "counttable.h"
  22
  23 /***********************************************************/
  24 class ChimeraPerseusCommand : public Command {
  25 public:
  26         ChimeraPerseusCommand(string);
  27         ChimeraPerseusCommand();
  28         ~ChimeraPerseusCommand() {}
  29
  30         vector<string> setParameters();
  31         string getCommandName()                 { return "chimera.perseus";             }
  32         string getCommandCategory()             { return "Sequence Processing"; }
  33
  34         string getHelpString();
  35     string getOutputPattern(string);
  36         string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011).  Removing noise from pyrosequenced amplicons.  BMC Bioinformatics  12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection.  Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; }
  37         string getDescription()         { return "detect chimeric sequences"; }
  38
  39         int execute();
  40         void help() { m->mothurOut(getHelpString()); }
  41
  42 private:
  43         struct linePair {
  44                 int start;
  45                 int end;
  46                 linePair(int i, int j) : start(i), end(j) {}
  47         };
  48
  49         bool abort, hasName, hasCount, dups;
  50         string fastafile, groupfile, countfile, outputDir, namefile;
  51         int processors, alignLength;
  52         double cutoff, alpha, beta;
  53     SequenceParser* parser;
  54     SequenceCountParser* cparser;
  55
  56         vector<string> outputNames;
  57         vector<string> fastaFileNames;
  58         vector<string> nameFileNames;
  59         vector<string> groupFileNames;
  60
  61         string getNamesFile(string&);
  62         int driver(string, vector<seqData>&, string, int&);
  63         vector<seqData> readFiles(string, string);
  64     vector<seqData> readFiles(string inputFile, CountTable* ct);
  65         vector<seqData> loadSequences(string);
  66         int deconvoluteResults(map<string, string>&, string, string);
  67         int driverGroups(string, string, int, int, vector<string>);
  68         int createProcessesGroups(string, string, vector<string>, string, string, string);
  69     string removeNs(string);
  70 };
  71
  72 /**************************************************************************************************/
  73 //custom data structure for threads to use.
  74 // This is passed by void pointer so it can be any data type
  75 // that can be passed using a single void pointer (LPVOID).
  76 struct perseusData {
  77         string fastafile;
  78         string namefile;
  79         string groupfile;
  80         string outputFName;
  81         string accnos;
  82         MothurOut* m;
  83         int start;
  84         int end;
  85     bool hasName, hasCount;
  86         int threadID, count, numChimeras;
  87         double alpha, beta, cutoff;
  88         vector<string> groups;
  89
  90         perseusData(){}
  91         perseusData(bool hn, bool hc, double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
  92                 alpha = a;
  93                 beta = b;
  94                 cutoff = c;
  95                 fastafile = f;
  96                 namefile = n;
  97                 groupfile = g;
  98                 outputFName = o;
  99                 accnos = ac;
 100                 m = mout;
 101                 start = st;
 102                 end = en;
 103                 threadID = tid;
 104                 groups = gr;
 105         hasName = hn;
 106         hasCount = hc;
 107                 count = 0;
 108                 numChimeras = 0;
 109         }
 110 };
 111 /**************************************************************************************************/
 112 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 113 #else
 114 static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
 115         perseusData* pDataArray;
 116         pDataArray = (perseusData*)lpParam;
 117
 118         try {
 119
 120                 //clears files
 121                 ofstream out, out1, out2;
 122                 pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close();
 123                 pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
 124
 125                 //parse fasta and name file by group
 126                 SequenceParser* parser;
 127         SequenceCountParser* cparser;
 128                 if (pDataArray->hasCount) {
 129             CountTable* ct = new CountTable();
 130             ct->readTable(pDataArray->namefile);
 131             cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
 132             delete ct;
 133         }else {
 134             if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);  }
 135             else                                                        { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
 136         }
 137
 138                 int totalSeqs = 0;
 139                 int numChimeras = 0;
 140
 141                 for (int u = pDataArray->start; u < pDataArray->end; u++) {
 142
 143                         int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
 144
 145                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[u] + "...");  pDataArray->m->mothurOutEndLine();
 146
 147                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
 148                         ////////////////////////////////////////////////////////////////////////////////////////
 149                         bool error = false;
 150             int alignLength = 0;
 151             vector<seqData> sequences;
 152             if (pDataArray->hasCount) {
 153                 vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[u]);
 154                 map<string, int> counts = cparser->getCountTable(pDataArray->groups[u]);
 155                 map<string, int>::iterator it;
 156
 157                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
 158
 159                     if (pDataArray->m->control_pressed) {  break; }
 160
 161                     it = counts.find(thisGroupsSeqs[i].getName());
 162                     if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
 163                     else {
 164                         string newSeq = "";
 165                         string tempSeq = thisGroupsSeqs[i].getUnaligned();
 166                         for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') {  newSeq += tempSeq[j]; } }
 167                         thisGroupsSeqs[i].setAligned(newSeq);
 168
 169                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
 170                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
 171                     }
 172                 }
 173             }else{
 174                 vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[u]);
 175                 map<string, string> nameMap = parser->getNameMap(pDataArray->groups[u]);
 176                 map<string, string>::iterator it;
 177
 178                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
 179
 180                     if (pDataArray->m->control_pressed) {  break; }
 181
 182                     it = nameMap.find(thisGroupsSeqs[i].getName());
 183                     if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
 184                     else {
 185                         int num = pDataArray->m->getNumNames(it->second);
 186                         string newSeq = "";
 187                         string tempSeq = thisGroupsSeqs[i].getUnaligned();
 188                         for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') {  newSeq += tempSeq[j]; } }
 189                         thisGroupsSeqs[i].setAligned(newSeq);
 190
 191                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
 192                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
 193                     }
 194                 }
 195
 196             }
 197
 198
 199                         if (error) { pDataArray->m->control_pressed = true; }
 200
 201                         //sort by frequency
 202                         sort(sequences.rbegin(), sequences.rend());
 203                         ////////////////////////////////////////////////////////////////////////////////////////
 204
 205                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
 206
 207                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
 208                         ////////////////////////////////////////////////////////////////////////////////////////
 209                         string chimeraFileName = pDataArray->outputFName+pDataArray->groups[u];
 210                         string accnosFileName = pDataArray->accnos+pDataArray->groups[u];
 211
 212                         vector<vector<double> > correctModel(4);        //could be an option in the future to input own model matrix
 213                         for(int j=0;j<4;j++){   correctModel[j].resize(4);      }
 214
 215                         correctModel[0][0] = 0.000000;  //AA
 216                         correctModel[1][0] = 11.619259; //CA
 217                         correctModel[2][0] = 11.694004; //TA
 218                         correctModel[3][0] = 7.748623;  //GA
 219
 220                         correctModel[1][1] = 0.000000;  //CC
 221                         correctModel[2][1] = 7.619657;  //TC
 222                         correctModel[3][1] = 12.852562; //GC
 223
 224                         correctModel[2][2] = 0.000000;  //TT
 225                         correctModel[3][2] = 10.964048; //TG
 226
 227                         correctModel[3][3] = 0.000000;  //GG
 228
 229                         for(int k=0;k<4;k++){
 230                                 for(int j=0;j<k;j++){
 231                                         correctModel[j][k] = correctModel[k][j];
 232                                 }
 233                         }
 234
 235                         int numSeqs = sequences.size();
 236                         //int alignLength = sequences[0].sequence.size();
 237
 238                         ofstream chimeraFile;
 239                         ofstream accnosFile;
 240                         pDataArray->m->openOutputFile(chimeraFileName, chimeraFile);
 241                         pDataArray->m->openOutputFile(accnosFileName, accnosFile);
 242
 243                         Perseus myPerseus;
 244                         vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
 245
 246                         chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
 247
 248                         vector<bool> chimeras(numSeqs, 0);
 249
 250                         for(int j=0;j<numSeqs;j++){
 251
 252                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 253
 254                                 vector<bool> restricted = chimeras;
 255
 256                                 vector<vector<int> > leftDiffs(numSeqs);
 257                                 vector<vector<int> > leftMaps(numSeqs);
 258                                 vector<vector<int> > rightDiffs(numSeqs);
 259                                 vector<vector<int> > rightMaps(numSeqs);
 260
 261                                 vector<int> singleLeft, bestLeft;
 262                                 vector<int> singleRight, bestRight;
 263
 264                                 int bestSingleIndex, bestSingleDiff;
 265                                 vector<pwAlign> alignments(numSeqs);
 266
 267                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
 268
 269                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 270
 271                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
 272
 273                                 string dummyA, dummyB;
 274
 275                                 if(comparisons >= 2){
 276                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
 277
 278                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 279
 280                                         int minMismatchToTrimera = numeric_limits<int>::max();
 281                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
 282
 283                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
 284                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
 285
 286                                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 287                                         }
 288
 289                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
 290
 291                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 292
 293                                         string type;
 294                                         string chimeraRefSeq;
 295
 296                                         if(minMismatchToChimera - minMismatchToTrimera >= 3){
 297                                                 type = "trimera";
 298                                                 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
 299                                         }
 300                                         else{
 301                                                 type = "chimera";
 302                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
 303                                         }
 304
 305                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 306
 307                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
 308
 309                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 310
 311                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
 312                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
 313
 314                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
 315
 316                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
 317                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
 318                                         chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
 319                                         chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
 320
 321                                         double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
 322
 323                                         chimeraFile << probability << '\t';
 324
 325                                         if(probability > pDataArray->cutoff){
 326                                                 chimeraFile << type << endl;
 327                                                 accnosFile << sequences[j].seqName << endl;
 328                                                 chimeras[j] = 1;
 329                                                 numChimeras++;
 330                                         }
 331                                         else{
 332                                                 chimeraFile << "good" << endl;
 333                                         }
 334
 335                                 }
 336                                 else{
 337                                         chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
 338                                 }
 339                                 //report progress
 340                                 if((j+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n");               }
 341                         }
 342
 343                         if((numSeqs) % 100 != 0){       pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n");           }
 344
 345                         chimeraFile.close();
 346                         accnosFile.close();
 347                         ////////////////////////////////////////////////////////////////////////////////////////
 348
 349                         totalSeqs += numSeqs;
 350
 351                         //append files
 352                         pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
 353                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
 354                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[u] + ".");        pDataArray->m->mothurOutEndLine();
 355
 356                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
 357                 }
 358
 359                 pDataArray->count = totalSeqs;
 360                 if (pDataArray->hasCount) { delete cparser; } { delete parser; }
 361                 return totalSeqs;
 362
 363         }
 364         catch(exception& e) {
 365                 pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
 366                 exit(1);
 367         }
 368 }
 369 /**************************************************************************************************/
 370
 371 #endif
 372
 373 #endif
 374
 375