preclustercommand.h

   1 #ifndef PRECLUSTERCOMMAND_H
   2 #define PRECLUSTERCOMMAND_H
   3
   4
   5 /*
   6  *  preclustercommand.h
   7  *  Mothur
   8  *
   9  *  Created by westcott on 12/21/09.
  10  *  Copyright 2009 Schloss Lab. All rights reserved.
  11  *
  12  */
  13
  14
  15 #include "command.hpp"
  16 #include "sequence.hpp"
  17 #include "sequenceparser.h"
  18
  19 /************************************************************/
  20 struct seqPNode {
  21         int numIdentical;
  22         Sequence seq;
  23         string names;
  24         bool active;
  25         int diffs;
  26         seqPNode() {}
  27         seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) { diffs = 0; }
  28         ~seqPNode() {}
  29 };
  30 /************************************************************/
  31 inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
  32 //************************************************************/
  33
  34 class PreClusterCommand : public Command {
  35
  36 public:
  37         PreClusterCommand(string);
  38         PreClusterCommand();
  39         ~PreClusterCommand(){}
  40
  41         vector<string> setParameters();
  42         string getCommandName()                 { return "pre.cluster";                         }
  43         string getCommandCategory()             { return "Sequence Processing";         }
  44         string getHelpString();
  45         string getCitation() { return "http://www.mothur.org/wiki/Pre.cluster"; }
  46         string getDescription()         { return "implements a pseudo-single linkage algorithm with the goal of removing sequences that are likely due to pyrosequencing errors"; }
  47
  48
  49         int execute();
  50         void help() { m->mothurOut(getHelpString()); }
  51
  52 private:
  53
  54         struct linePair {
  55                 int start;
  56                 int end;
  57                 linePair(int i, int j) : start(i), end(j) {}
  58         };
  59
  60         int diffs, length, processors;
  61         bool abort, bygroup;
  62         string fastafile, namefile, outputDir, groupfile;
  63         vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
  64         map<string, string> names; //represents the names file first column maps to second column
  65         map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
  66         map<string, int>::iterator itSize;
  67 //      map<string, bool> active; //maps sequence name to whether it has already been merged or not.
  68         vector<string> outputNames;
  69         map<string, vector<string> > outputTypes;
  70
  71         int readFASTA();
  72         void readNameFile();
  73         //int readNamesFASTA();
  74         int calcMisMatches(string, string);
  75         void printData(string, string); //fasta filename, names file name
  76         int process(string);
  77         int loadSeqs(map<string, string>&, vector<Sequence>&);
  78         int driverGroups(SequenceParser*, string, string, string, int, int, vector<string> groups);
  79         int createProcessesGroups(SequenceParser*, string, string, string, vector<string>);
  80 };
  81
  82 /**************************************************************************************************/
  83 //custom data structure for threads to use.
  84 // This is passed by void pointer so it can be any data type
  85 // that can be passed using a single void pointer (LPVOID).
  86 struct preClusterData {
  87         string fastafile;
  88         string namefile;
  89         string groupfile;
  90         string newFName, newNName, newMName;
  91         MothurOut* m;
  92         int start;
  93         int end;
  94         int diffs, threadID;
  95         vector<string> groups;
  96         vector<string> mapFileNames;
  97
  98         preClusterData(){}
  99         preClusterData(string f, string n, string g, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
 100                 fastafile = f;
 101                 namefile = n;
 102                 groupfile = g;
 103                 newFName = nff;
 104                 newNName = nnf;
 105                 newMName = nmf;
 106                 m = mout;
 107                 start = st;
 108                 end = en;
 109                 diffs = d;
 110                 threadID = tid;
 111                 groups = gr;
 112         }
 113 };
 114
 115 /**************************************************************************************************/
 116 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 117 #else
 118 static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
 119         preClusterData* pDataArray;
 120         pDataArray = (preClusterData*)lpParam;
 121
 122         try {
 123
 124                 //parse fasta and name file by group
 125                 SequenceParser* parser;
 126                 if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
 127                 else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
 128
 129                 int numSeqs = 0;
 130                 vector<seqPNode> alignSeqs;
 131                 //clear out old files
 132                 ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close();
 133                 ofstream outN; pDataArray->m->openOutputFile(pDataArray->newNName, outN);  outN.close();
 134
 135                 //precluster each group
 136                 for (int k = pDataArray->start; k < pDataArray->end; k++) {
 137
 138                         int start = time(NULL);
 139
 140                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
 141
 142                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
 143
 144                         map<string, string> thisNameMap;
 145                         if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
 146                         vector<Sequence> thisSeqs = parser->getSeqs(pDataArray->groups[k]);
 147
 148                         //fill alignSeqs with this groups info.
 149                         ////////////////////////////////////////////////////
 150                         //numSeqs = loadSeqs(thisNameMap, thisSeqs); same function below
 151
 152                         int length = 0;
 153                         alignSeqs.clear();
 154                         map<string, string>::iterator it;
 155                         bool error = false;
 156
 157                         for (int i = 0; i < thisSeqs.size(); i++) {
 158
 159                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
 160
 161                                 if (pDataArray->namefile != "") {
 162                                         it = thisNameMap.find(thisSeqs[i].getName());
 163
 164                                         //should never be true since parser checks for this
 165                                         if (it == thisNameMap.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
 166                                         else{
 167                                                 //get number of reps
 168                                                 int numReps = 1;
 169                                                 for(int j=0;j<(it->second).length();j++){
 170                                                         if((it->second)[j] == ','){     numReps++;      }
 171                                                 }
 172
 173                                                 seqPNode tempNode(numReps, thisSeqs[i], it->second);
 174                                                 alignSeqs.push_back(tempNode);
 175                                                 if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
 176                                         }
 177                                 }else { //no names file, you are identical to yourself
 178                                         seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
 179                                         alignSeqs.push_back(tempNode);
 180                                         if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
 181                                 }
 182                         }
 183
 184                         //sanity check
 185                         if (error) { pDataArray->m->control_pressed = true; }
 186
 187                         thisSeqs.clear();
 188                         numSeqs = alignSeqs.size();
 189
 190                         ////////////////////////////////////////////////////
 191
 192                         if (pDataArray->m->control_pressed) {   delete parser; return 0; }
 193
 194                         if (pDataArray->diffs > length) { pDataArray->m->mothurOut("Error: diffs is greater than your sequence length."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; return 0;  }
 195
 196                         ////////////////////////////////////////////////////
 197                         //int count = process(); - same function below
 198
 199                         ofstream out;
 200                         pDataArray->m->openOutputFile(pDataArray->newMName+pDataArray->groups[k]+".map", out);
 201                         pDataArray->mapFileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
 202
 203                         //sort seqs by number of identical seqs
 204                         sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
 205
 206                         int count = 0;
 207
 208                         //think about running through twice...
 209                         for (int i = 0; i < numSeqs; i++) {
 210
 211                                 //are you active
 212                                 //                      itActive = active.find(alignSeqs[i].seq.getName());
 213
 214                                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
 215
 216                                         string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
 217
 218                                         //try to merge it with all smaller seqs
 219                                         for (int j = i+1; j < numSeqs; j++) {
 220
 221                                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
 222
 223                                                 if (alignSeqs[j].active) {  //this sequence has not been merged yet
 224                                                         //are you within "diff" bases
 225                                                         //int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
 226                                                         int mismatch = 0;
 227
 228                                                         for (int k = 0; k < alignSeqs[i].seq.getAligned().length(); k++) {
 229                                                                 //do they match
 230                                                                 if (alignSeqs[i].seq.getAligned()[k] != alignSeqs[j].seq.getAligned()[k]) { mismatch++; }
 231                                                                 if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
 232                                                         }
 233
 234                                                         if (mismatch <= pDataArray->diffs) {
 235                                                                 //merge
 236                                                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
 237                                                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
 238
 239                                                                 alignSeqs[j].active = 0;
 240                                                                 alignSeqs[j].numIdentical = 0;
 241                                                                 alignSeqs[j].diffs = mismatch;
 242                                                                 count++;
 243                                                                 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
 244                                                         }
 245                                                 }//end if j active
 246                                         }//end for loop j
 247
 248                                         //remove from active list
 249                                         alignSeqs[i].active = 0;
 250
 251                                         out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;
 252
 253                                 }//end if active i
 254                                 if(i % 100 == 0)        { pDataArray->m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine(); }
 255                         }
 256                         out.close();
 257                         if(numSeqs % 100 != 0)  { pDataArray->m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine();   }
 258                         ////////////////////////////////////////////////////
 259
 260                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
 261
 262                         pDataArray->m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + ".");pDataArray-> m->mothurOutEndLine();
 263                         pDataArray->m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOutEndLine();
 264
 265                         ////////////////////////////////////////////////////
 266                         //printData(pDataArray->newFFile, pDataArray->newNFile); - same as below
 267                         ofstream outFasta;
 268                         ofstream outNames;
 269
 270                         pDataArray->m->openOutputFileAppend(pDataArray->newFName, outFasta);
 271                         pDataArray->m->openOutputFileAppend(pDataArray->newNName, outNames);
 272
 273                         for (int i = 0; i < alignSeqs.size(); i++) {
 274                                 if (alignSeqs[i].numIdentical != 0) {
 275                                         alignSeqs[i].seq.printSequence(outFasta);
 276                                         outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
 277                                 }
 278                         }
 279
 280                         outFasta.close();
 281                         outNames.close();
 282                         ////////////////////////////////////////////////////
 283
 284                         pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); pDataArray->m->mothurOutEndLine();
 285
 286                 }
 287
 288                 return numSeqs;
 289
 290
 291         }
 292         catch(exception& e) {
 293                 pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction");
 294                 exit(1);
 295         }
 296 }
 297 #endif
 298
 299 /**************************************************************************************************/
 300
 301
 302 #endif
 303
 304