]> git.donarmstrong.com Git - mothur.git/blob - chimeraperseuscommand.h
Merge remote-tracking branch 'mothur/master'
[mothur.git] / chimeraperseuscommand.h
1 #ifndef CHIMERAPERSEUSCOMMAND_H
2 #define CHIMERAPERSEUSCOMMAND_H
3
4
5 /*
6  *  chimeraperseuscommand.h
7  *  Mothur
8  *
9  *  Created by westcott on 10/26/11.
10  *  Copyright 2011 Schloss Lab. All rights reserved.
11  *
12  */
13
14
15
16 #include "mothur.h"
17 #include "command.hpp"
18 #include "sequenceparser.h"
19 #include "sequencecountparser.h"
20 #include "myPerseus.h"
21 #include "counttable.h"
22
23 /***********************************************************/
24 class ChimeraPerseusCommand : public Command {
25 public:
26         ChimeraPerseusCommand(string);
27         ChimeraPerseusCommand();
28         ~ChimeraPerseusCommand() {}
29         
30         vector<string> setParameters();
31         string getCommandName()                 { return "chimera.perseus";             }
32         string getCommandCategory()             { return "Sequence Processing"; }
33         string getOutputFileNameTag(string, string);
34         string getHelpString(); 
35         string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011).  Removing noise from pyrosequenced amplicons.  BMC Bioinformatics  12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection.  Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; }
36         string getDescription()         { return "detect chimeric sequences"; }
37         
38         int execute(); 
39         void help() { m->mothurOut(getHelpString()); }          
40         
41 private:
42         struct linePair {
43                 int start;
44                 int end;
45                 linePair(int i, int j) : start(i), end(j) {}
46         };
47         
48         bool abort, hasName, hasCount;
49         string fastafile, groupfile, countfile, outputDir, namefile;
50         int processors, alignLength;
51         double cutoff, alpha, beta;
52     SequenceParser* parser;
53     SequenceCountParser* cparser;
54         
55         vector<string> outputNames;
56         vector<string> fastaFileNames;
57         vector<string> nameFileNames;
58         vector<string> groupFileNames;
59         
60         string getNamesFile(string&);
61         int driver(string, vector<seqData>&, string, int&);
62         vector<seqData> readFiles(string, string);
63     vector<seqData> readFiles(string inputFile, CountTable* ct);
64         vector<seqData> loadSequences(string);
65         int deconvoluteResults(map<string, string>&, string, string);
66         int driverGroups(string, string, int, int, vector<string>);
67         int createProcessesGroups(string, string, vector<string>, string, string, string);
68 };
69
70 /**************************************************************************************************/
71 //custom data structure for threads to use.
72 // This is passed by void pointer so it can be any data type
73 // that can be passed using a single void pointer (LPVOID).
74 struct perseusData {
75         string fastafile; 
76         string namefile; 
77         string groupfile;
78         string outputFName;
79         string accnos;
80         MothurOut* m;
81         int start;
82         int end;
83     bool hasName, hasCount;
84         int threadID, count, numChimeras;
85         double alpha, beta, cutoff;
86         vector<string> groups;
87         
88         perseusData(){}
89         perseusData(bool hn, bool hc, double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
90                 alpha = a;
91                 beta = b;
92                 cutoff = c;
93                 fastafile = f;
94                 namefile = n;
95                 groupfile = g;
96                 outputFName = o;
97                 accnos = ac;
98                 m = mout;
99                 start = st;
100                 end = en;
101                 threadID = tid;
102                 groups = gr;
103         hasName = hn;
104         hasCount = hc;
105                 count = 0;
106                 numChimeras = 0;
107         }
108 };
109 /**************************************************************************************************/
110 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
111 #else
112 static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ 
113         perseusData* pDataArray;
114         pDataArray = (perseusData*)lpParam;
115         
116         try {
117                 
118                 //clears files
119                 ofstream out, out1, out2;
120                 pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close(); 
121                 pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
122                 
123                 //parse fasta and name file by group
124                 SequenceParser* parser;
125         SequenceCountParser* cparser;
126                 if (pDataArray->hasCount) {
127             CountTable* ct = new CountTable();
128             ct->readTable(pDataArray->namefile);
129             cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
130             delete ct;
131         }else {
132             if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);  }
133             else                                                        { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
134         }
135     
136                 int totalSeqs = 0;
137                 int numChimeras = 0;
138                 
139                 for (int i = pDataArray->start; i < pDataArray->end; i++) {
140                         
141                         int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
142                         
143                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "...");  pDataArray->m->mothurOutEndLine();                                      
144                         
145                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
146                         ////////////////////////////////////////////////////////////////////////////////////////
147                         bool error = false;
148             int alignLength = 0;
149             vector<seqData> sequences;
150             if (pDataArray->hasCount) {
151                 vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]);
152                 map<string, int> counts = cparser->getCountTable(pDataArray->groups[i]);
153                 map<string, int>::iterator it;
154                 
155                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
156                     
157                     if (pDataArray->m->control_pressed) {  break; }
158                     
159                     it = counts.find(thisGroupsSeqs[i].getName());
160                     if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
161                     else {
162                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
163                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
164                     }
165                 }
166             }else{
167                 vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
168                 map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
169                 map<string, string>::iterator it;
170                 
171                 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
172                     
173                     if (pDataArray->m->control_pressed) {  break; }
174                     
175                     it = nameMap.find(thisGroupsSeqs[i].getName());
176                     if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
177                     else {
178                         int num = pDataArray->m->getNumNames(it->second);
179                         sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
180                         if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
181                     }
182                 }
183                 
184             }
185             
186                         
187                         if (error) { pDataArray->m->control_pressed = true; }
188                         
189                         //sort by frequency
190                         sort(sequences.rbegin(), sequences.rend());
191                         ////////////////////////////////////////////////////////////////////////////////////////
192
193                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
194                         
195                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
196                         ////////////////////////////////////////////////////////////////////////////////////////
197                         string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i];
198                         string accnosFileName = pDataArray->accnos+pDataArray->groups[i];
199                         
200                         vector<vector<double> > correctModel(4);        //could be an option in the future to input own model matrix
201                         for(int j=0;j<4;j++){   correctModel[j].resize(4);      }
202                         
203                         correctModel[0][0] = 0.000000;  //AA
204                         correctModel[1][0] = 11.619259; //CA
205                         correctModel[2][0] = 11.694004; //TA
206                         correctModel[3][0] = 7.748623;  //GA
207                         
208                         correctModel[1][1] = 0.000000;  //CC
209                         correctModel[2][1] = 7.619657;  //TC
210                         correctModel[3][1] = 12.852562; //GC
211                         
212                         correctModel[2][2] = 0.000000;  //TT
213                         correctModel[3][2] = 10.964048; //TG
214                         
215                         correctModel[3][3] = 0.000000;  //GG
216                         
217                         for(int k=0;k<4;k++){
218                                 for(int j=0;j<k;j++){
219                                         correctModel[j][k] = correctModel[k][j];
220                                 }
221                         }
222                         
223                         int numSeqs = sequences.size();
224                         //int alignLength = sequences[0].sequence.size();
225                         
226                         ofstream chimeraFile;
227                         ofstream accnosFile;
228                         pDataArray->m->openOutputFile(chimeraFileName, chimeraFile); 
229                         pDataArray->m->openOutputFile(accnosFileName, accnosFile); 
230                         
231                         Perseus myPerseus;
232                         vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
233                         
234                         chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
235                         
236                         vector<bool> chimeras(numSeqs, 0);
237                         
238                         for(int j=0;j<numSeqs;j++){     
239                                 
240                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
241                                 
242                                 vector<bool> restricted = chimeras;
243                                 
244                                 vector<vector<int> > leftDiffs(numSeqs);
245                                 vector<vector<int> > leftMaps(numSeqs);
246                                 vector<vector<int> > rightDiffs(numSeqs);
247                                 vector<vector<int> > rightMaps(numSeqs);
248                                 
249                                 vector<int> singleLeft, bestLeft;
250                                 vector<int> singleRight, bestRight;
251                                 
252                                 int bestSingleIndex, bestSingleDiff;
253                                 vector<pwAlign> alignments(numSeqs);
254                                 
255                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
256                                 
257                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
258                                 
259                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
260                                 
261                                 string dummyA, dummyB;
262                                 
263                                 if(comparisons >= 2){   
264                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
265                                         
266                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
267                                         
268                                         int minMismatchToTrimera = numeric_limits<int>::max();
269                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
270                                         
271                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
272                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
273                                                 
274                                                 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
275                                         }
276                                         
277                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
278                                         
279                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
280                                         
281                                         string type;
282                                         string chimeraRefSeq;
283                                         
284                                         if(minMismatchToChimera - minMismatchToTrimera >= 3){
285                                                 type = "trimera";
286                                                 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
287                                         }
288                                         else{
289                                                 type = "chimera";
290                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
291                                         }
292                                         
293                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
294                                         
295                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
296                                         
297                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
298                                         
299                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
300                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
301                                         
302                                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
303                                         
304                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
305                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
306                                         chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
307                                         chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
308                                         
309                                         double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
310                                         
311                                         chimeraFile << probability << '\t';
312                                         
313                                         if(probability > pDataArray->cutoff){ 
314                                                 chimeraFile << type << endl;
315                                                 accnosFile << sequences[j].seqName << endl;
316                                                 chimeras[j] = 1;
317                                                 numChimeras++;
318                                         }
319                                         else{
320                                                 chimeraFile << "good" << endl;
321                                         }
322                                         
323                                 }
324                                 else{
325                                         chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
326                                 }
327                                 //report progress
328                                 if((j+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n");               }
329                         }
330                         
331                         if((numSeqs) % 100 != 0){       pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n");           }
332                         
333                         chimeraFile.close();
334                         accnosFile.close();
335                         ////////////////////////////////////////////////////////////////////////////////////////
336
337                         totalSeqs += numSeqs;
338                         
339                         //append files
340                         pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
341                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
342                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + ".");        pDataArray->m->mothurOutEndLine();                                      
343                         
344                         if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
345                 }       
346                 
347                 pDataArray->count = totalSeqs;
348                 if (pDataArray->hasCount) { delete cparser; } { delete parser; }
349                 return totalSeqs;
350                 
351         }
352         catch(exception& e) {
353                 pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
354                 exit(1);
355         }
356
357 /**************************************************************************************************/
358
359 #endif
360
361 #endif
362
363