]> git.donarmstrong.com Git - mothur.git/blob - chimeraperseuscommand.h
changed random forest output filename
[mothur.git] / chimeraperseuscommand.h
1 #ifndef CHIMERAPERSEUSCOMMAND_H
2 #define CHIMERAPERSEUSCOMMAND_H
3
4
5 /*
6  *  chimeraperseuscommand.h
7  *  Mothur
8  *
9  *  Created by westcott on 10/26/11.
10  *  Copyright 2011 Schloss Lab. All rights reserved.
11  *
12  */
13
14
15
16 #include "mothur.h"
17 #include "command.hpp"
18 #include "sequenceparser.h"
19 #include "sequencecountparser.h"
20 #include "myPerseus.h"
21 #include "counttable.h"
22
23 /***********************************************************/
24 class ChimeraPerseusCommand : public Command {
25 public:
26         ChimeraPerseusCommand(string);
27         ChimeraPerseusCommand();
28         ~ChimeraPerseusCommand() {}
29         
30         vector<string> setParameters();
31         string getCommandName()                 { return "chimera.perseus";             }
32         string getCommandCategory()             { return "Sequence Processing"; }
33         
34         string getHelpString(); 
35     string getOutputPattern(string);    
36         string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011).  Removing noise from pyrosequenced amplicons.  BMC Bioinformatics  12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection.  Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; }
37         string getDescription()         { return "detect chimeric sequences"; }
38         
39         int execute(); 
40         void help() { m->mothurOut(getHelpString()); }          
41         
42 private:
43         struct linePair {
44                 int start;
45                 int end;
46                 linePair(int i, int j) : start(i), end(j) {}
47         };
48         
49         bool abort, hasName, hasCount, dups;
50         string fastafile, groupfile, countfile, outputDir, namefile;
51         int processors, alignLength;
52         double cutoff, alpha, beta;
53     SequenceParser* parser;
54     SequenceCountParser* cparser;
55         
56         vector<string> outputNames;
57         vector<string> fastaFileNames;
58         vector<string> nameFileNames;
59         vector<string> groupFileNames;
60         
61         string getNamesFile(string&);
62         int driver(string, vector<seqData>&, string, int&);
63         vector<seqData> readFiles(string, string);
64     vector<seqData> readFiles(string inputFile, CountTable* ct);
65         vector<seqData> loadSequences(string);
66         int deconvoluteResults(map<string, string>&, string, string);
67         int driverGroups(string, string, string, int, int, vector<string>);
68         int createProcessesGroups(string, string, string, vector<string>, string, string, string);
69     string removeNs(string);
70 };
71
72 /**************************************************************************************************/
73 //custom data structure for threads to use.
74 // This is passed by void pointer so it can be any data type
75 // that can be passed using a single void pointer (LPVOID).
76 struct perseusData {
77         string fastafile; 
78         string namefile; 
79         string groupfile;
80         string outputFName;
81         string accnos;
82     string countlist;
83         MothurOut* m;
84         int start;
85         int end;
86     bool hasName, hasCount, dups;
87         int threadID, count, numChimeras;
88         double alpha, beta, cutoff;
89         vector<string> groups;
90         
91         perseusData(){}
92         perseusData(bool dps, bool hn, bool hc, double a, double b, double c, string o,  string f, string n, string g, string ac, string ctlist, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
93                 alpha = a;
94                 beta = b;
95                 cutoff = c;
96                 fastafile = f;
97                 namefile = n;
98                 groupfile = g;
99                 outputFName = o;
100         countlist = ctlist;
101                 accnos = ac;
102                 m = mout;
103                 start = st;
104                 end = en;
105                 threadID = tid;
106                 groups = gr;
107         hasName = hn;
108         hasCount = hc;
109         dups = dps;
110                 count = 0;
111                 numChimeras = 0;
112         }
113 };
114 /**************************************************************************************************/
115 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
116 #else
117 static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ 
118         perseusData* pDataArray;
119         pDataArray = (perseusData*)lpParam;
120         
121         try {
122                 
123                 //clears files
124                 ofstream out, out1, out2;
125                 pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close(); 
126                 pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
127                 
128                 //parse fasta and name file by group
129                 SequenceParser* parser;
130         SequenceCountParser* cparser;
131                 if (pDataArray->hasCount) {
132             CountTable* ct = new CountTable();
133             ct->readTable(pDataArray->namefile, true);
134             cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
135             delete ct;
136         }else {
137             if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);  }
138             else                                                        { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
139         }
140     
141                 int totalSeqs = 0;
142                 int numChimeras = 0;
143         
144         ofstream outCountList;
145         if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); }
146                 
147                 for (int u = pDataArray->start; u < pDataArray->end; u++) {
148                         
149                         int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
150                         
151                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[u] + "...");  pDataArray->m->mothurOutEndLine();                                      
152                         
153                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
154                         ////////////////////////////////////////////////////////////////////////////////////////
155                         bool error = false;
156             int alignLength = 0;
157             vector<seqData> sequences;
158             if (pDataArray->hasCount) {
159                 vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[u]);
160                 map<string, int> counts = cparser->getCountTable(pDataArray->groups[u]);
161                 map<string, int>::iterator it;
162                 
163                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
164                     
165                     if (pDataArray->m->control_pressed) {  break; }
166                     
167                     it = counts.find(thisGroupsSeqs[i].getName());
168                     if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
169                     else {
170                         string newSeq = "";
171                         string tempSeq = thisGroupsSeqs[i].getUnaligned();
172                         for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') {  newSeq += tempSeq[j]; } }
173                         thisGroupsSeqs[i].setAligned(newSeq);
174                         
175                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
176                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
177                     }
178                 }
179             }else{
180                 vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[u]);
181                 map<string, string> nameMap = parser->getNameMap(pDataArray->groups[u]);
182                 map<string, string>::iterator it;
183                 
184                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
185                     
186                     if (pDataArray->m->control_pressed) {  break; }
187                     
188                     it = nameMap.find(thisGroupsSeqs[i].getName());
189                     if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
190                     else {
191                         int num = pDataArray->m->getNumNames(it->second);
192                         string newSeq = "";
193                         string tempSeq = thisGroupsSeqs[i].getUnaligned();
194                         for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') {  newSeq += tempSeq[j]; } }
195                         thisGroupsSeqs[i].setAligned(newSeq);
196
197                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
198                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
199                     }
200                 }
201                 
202             }
203             
204                         
205                         if (error) { pDataArray->m->control_pressed = true; }
206                         
207                         //sort by frequency
208                         sort(sequences.rbegin(), sequences.rend());
209                         ////////////////////////////////////////////////////////////////////////////////////////
210
211                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
212                         
213                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
214                         ////////////////////////////////////////////////////////////////////////////////////////
215                         string chimeraFileName = pDataArray->outputFName+pDataArray->groups[u];
216                         string accnosFileName = pDataArray->accnos+pDataArray->groups[u];
217                         
218                         vector<vector<double> > correctModel(4);        //could be an option in the future to input own model matrix
219                         for(int j=0;j<4;j++){   correctModel[j].resize(4);      }
220                         
221                         correctModel[0][0] = 0.000000;  //AA
222                         correctModel[1][0] = 11.619259; //CA
223                         correctModel[2][0] = 11.694004; //TA
224                         correctModel[3][0] = 7.748623;  //GA
225                         
226                         correctModel[1][1] = 0.000000;  //CC
227                         correctModel[2][1] = 7.619657;  //TC
228                         correctModel[3][1] = 12.852562; //GC
229                         
230                         correctModel[2][2] = 0.000000;  //TT
231                         correctModel[3][2] = 10.964048; //TG
232                         
233                         correctModel[3][3] = 0.000000;  //GG
234                         
235                         for(int k=0;k<4;k++){
236                                 for(int j=0;j<k;j++){
237                                         correctModel[j][k] = correctModel[k][j];
238                                 }
239                         }
240                         
241                         int numSeqs = sequences.size();
242                         //int alignLength = sequences[0].sequence.size();
243                         
244                         ofstream chimeraFile;
245                         ofstream accnosFile;
246                         pDataArray->m->openOutputFile(chimeraFileName, chimeraFile); 
247                         pDataArray->m->openOutputFile(accnosFileName, accnosFile); 
248                         
249                         Perseus myPerseus;
250                         vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
251                         
252                         chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
253                         
254                         vector<bool> chimeras(numSeqs, 0);
255                         
256                         for(int j=0;j<numSeqs;j++){     
257                                 
258                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
259                                 
260                                 vector<bool> restricted = chimeras;
261                                 
262                                 vector<vector<int> > leftDiffs(numSeqs);
263                                 vector<vector<int> > leftMaps(numSeqs);
264                                 vector<vector<int> > rightDiffs(numSeqs);
265                                 vector<vector<int> > rightMaps(numSeqs);
266                                 
267                                 vector<int> singleLeft, bestLeft;
268                                 vector<int> singleRight, bestRight;
269                                 
270                                 int bestSingleIndex, bestSingleDiff;
271                                 vector<pwAlign> alignments(numSeqs);
272                                 
273                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
274                                 
275                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
276                                 
277                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
278                                 
279                                 string dummyA, dummyB;
280                                 
281                                 if(comparisons >= 2){   
282                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
283                                         
284                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
285                                         
286                                         int minMismatchToTrimera = numeric_limits<int>::max();
287                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
288                                         
289                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
290                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
291                                                 
292                                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
293                                         }
294                                         
295                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
296                                         
297                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
298                                         
299                                         string type;
300                                         string chimeraRefSeq;
301                                         
302                                         if(minMismatchToChimera - minMismatchToTrimera >= 3){
303                                                 type = "trimera";
304                                                 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
305                                         }
306                                         else{
307                                                 type = "chimera";
308                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
309                                         }
310                                         
311                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
312                                         
313                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
314                                         
315                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
316                                         
317                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
318                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
319                                         
320                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
321                                         
322                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
323                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
324                                         chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
325                                         chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
326                                         
327                                         double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
328                                         
329                                         chimeraFile << probability << '\t';
330                                         
331                                         if(probability > pDataArray->cutoff){ 
332                                                 chimeraFile << type << endl;
333                                                 accnosFile << sequences[j].seqName << endl;
334                                                 chimeras[j] = 1;
335                                                 numChimeras++;
336                                         }
337                                         else{
338                                                 chimeraFile << "good" << endl;
339                                         }
340                                         
341                                 }
342                                 else{
343                                         chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
344                                 }
345                                 //report progress
346                                 if((j+1) % 100 == 0){   pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(j+1) + "\n");           }
347                         }
348                         
349                         if((numSeqs) % 100 != 0){       pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs) + "\n");               }
350                         
351                         chimeraFile.close();
352                         accnosFile.close();
353                         ////////////////////////////////////////////////////////////////////////////////////////
354
355                         totalSeqs += numSeqs;
356             
357             if (pDataArray->dups) {
358                 if (!pDataArray->m->isBlank(accnosFileName)) {
359                     ifstream in;
360                     pDataArray->m->openInputFile(accnosFileName, in);
361                     string name;
362                     if (pDataArray->hasCount) {
363                         while (!in.eof()) {
364                             in >> name; pDataArray->m->gobble(in);
365                             outCountList << name << '\t' << pDataArray->groups[u] << endl;
366                         }
367                         in.close();
368                     }else {
369                         map<string, string> thisnamemap = parser->getNameMap(pDataArray->groups[u]);
370                         map<string, string>::iterator itN;
371                         ofstream out;
372                         pDataArray->m->openOutputFile(accnosFileName+".temp", out);
373                         while (!in.eof()) {
374                             in >> name; pDataArray->m->gobble(in);
375                             itN = thisnamemap.find(name);
376                             if (itN != thisnamemap.end()) {
377                                 vector<string> tempNames; pDataArray->m->splitAtComma(itN->second, tempNames);
378                                 for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; }
379                                 
380                             }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; }
381                         }
382                         out.close();
383                         in.close();
384                         pDataArray->m->renameFile(accnosFileName+".temp", accnosFileName);
385                     }
386                     
387                 }
388             }
389                         
390                         //append files
391                         pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
392                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
393                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[u] + ".");        pDataArray->m->mothurOutEndLine();                                      
394                         
395                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
396                 }       
397                 
398         if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); }
399         
400                 pDataArray->count = totalSeqs;
401                 if (pDataArray->hasCount) { delete cparser; } { delete parser; }
402                 return totalSeqs;
403                 
404         }
405         catch(exception& e) {
406                 pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
407                 exit(1);
408         }
409
410 /**************************************************************************************************/
411
412 #endif
413
414 #endif
415
416