]> git.donarmstrong.com Git - mothur.git/blob - chimeraperseuscommand.h
PDS: update citation fields
[mothur.git] / chimeraperseuscommand.h
1 #ifndef CHIMERAPERSEUSCOMMAND_H
2 #define CHIMERAPERSEUSCOMMAND_H
3
4
5 /*
6  *  chimeraperseuscommand.h
7  *  Mothur
8  *
9  *  Created by westcott on 10/26/11.
10  *  Copyright 2011 Schloss Lab. All rights reserved.
11  *
12  */
13
14
15
16 #include "mothur.h"
17 #include "command.hpp"
18 #include "sequenceparser.h"
19 #include "myPerseus.h"
20
21 /***********************************************************/
22 class ChimeraPerseusCommand : public Command {
23 public:
24         ChimeraPerseusCommand(string);
25         ChimeraPerseusCommand();
26         ~ChimeraPerseusCommand() {}
27         
28         vector<string> setParameters();
29         string getCommandName()                 { return "chimera.perseus";             }
30         string getCommandCategory()             { return "Sequence Processing"; }
31         string getOutputFileNameTag(string, string);
32         string getHelpString(); 
33         string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011).  Removing noise from pyrosequenced amplicons.  BMC Bioinformatics  12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection.  Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; }
34         string getDescription()         { return "detect chimeric sequences"; }
35         
36         int execute(); 
37         void help() { m->mothurOut(getHelpString()); }          
38         
39 private:
40         struct linePair {
41                 int start;
42                 int end;
43                 linePair(int i, int j) : start(i), end(j) {}
44         };
45         
46         bool abort;
47         string fastafile, groupfile, outputDir, namefile;
48         int processors, alignLength;
49         double cutoff, alpha, beta;
50         
51         vector<string> outputNames;
52         vector<string> fastaFileNames;
53         vector<string> nameFileNames;
54         vector<string> groupFileNames;
55         
56         string getNamesFile(string&);
57         int driver(string, vector<seqData>&, string, int&);
58         vector<seqData> readFiles(string, string);
59         vector<seqData> loadSequences(SequenceParser&, string);
60         int deconvoluteResults(SequenceParser&, string, string);
61         int driverGroups(SequenceParser&, string, string, int, int, vector<string>);
62         int createProcessesGroups(SequenceParser&, string, string, vector<string>, string, string, string);
63 };
64
65 /**************************************************************************************************/
66 //custom data structure for threads to use.
67 // This is passed by void pointer so it can be any data type
68 // that can be passed using a single void pointer (LPVOID).
69 struct perseusData {
70         string fastafile; 
71         string namefile; 
72         string groupfile;
73         string outputFName;
74         string accnos;
75         MothurOut* m;
76         int start;
77         int end;
78         int threadID, count, numChimeras;
79         double alpha, beta, cutoff;
80         vector<string> groups;
81         
82         perseusData(){}
83         perseusData(double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
84                 alpha = a;
85                 beta = b;
86                 cutoff = c;
87                 fastafile = f;
88                 namefile = n;
89                 groupfile = g;
90                 outputFName = o;
91                 accnos = ac;
92                 m = mout;
93                 start = st;
94                 end = en;
95                 threadID = tid;
96                 groups = gr;
97                 count = 0;
98                 numChimeras = 0;
99         }
100 };
101 /**************************************************************************************************/
102 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
103 #else
104 static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ 
105         perseusData* pDataArray;
106         pDataArray = (perseusData*)lpParam;
107         
108         try {
109                 
110                 //clears files
111                 ofstream out, out1, out2;
112                 pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close(); 
113                 pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
114                 
115                 //parse fasta and name file by group
116                 SequenceParser* parser;
117                 if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
118                 else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
119                 
120                 int totalSeqs = 0;
121                 int numChimeras = 0;
122                 
123                 for (int i = pDataArray->start; i < pDataArray->end; i++) {
124                         
125                         int start = time(NULL);  if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
126                         
127                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "...");  pDataArray->m->mothurOutEndLine();                                      
128                         
129                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
130                         ////////////////////////////////////////////////////////////////////////////////////////
131                         vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
132                         map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
133                         map<string, string>::iterator it;
134                         
135                         vector<seqData> sequences;
136                         bool error = false;
137                         
138                         for (int j = 0; j < thisGroupsSeqs.size(); j++) {
139                                 
140                                 if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
141                                 
142                                 it = nameMap.find(thisGroupsSeqs[j].getName());
143                                 if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
144                                 else {
145                                         int num = pDataArray->m->getNumNames(it->second);
146                                         sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num));
147                                 }
148                         }
149                         
150                         if (error) { pDataArray->m->control_pressed = true; }
151                         
152                         //sort by frequency
153                         sort(sequences.rbegin(), sequences.rend());
154                         ////////////////////////////////////////////////////////////////////////////////////////
155
156                         if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
157                         
158                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
159                         ////////////////////////////////////////////////////////////////////////////////////////
160                         string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i];
161                         string accnosFileName = pDataArray->accnos+pDataArray->groups[i];
162                         
163                         vector<vector<double> > correctModel(4);        //could be an option in the future to input own model matrix
164                         for(int j=0;j<4;j++){   correctModel[j].resize(4);      }
165                         
166                         correctModel[0][0] = 0.000000;  //AA
167                         correctModel[1][0] = 11.619259; //CA
168                         correctModel[2][0] = 11.694004; //TA
169                         correctModel[3][0] = 7.748623;  //GA
170                         
171                         correctModel[1][1] = 0.000000;  //CC
172                         correctModel[2][1] = 7.619657;  //TC
173                         correctModel[3][1] = 12.852562; //GC
174                         
175                         correctModel[2][2] = 0.000000;  //TT
176                         correctModel[3][2] = 10.964048; //TG
177                         
178                         correctModel[3][3] = 0.000000;  //GG
179                         
180                         for(int k=0;k<4;k++){
181                                 for(int j=0;j<k;j++){
182                                         correctModel[j][k] = correctModel[k][j];
183                                 }
184                         }
185                         
186                         int numSeqs = sequences.size();
187                         int alignLength = sequences[0].sequence.size();
188                         
189                         ofstream chimeraFile;
190                         ofstream accnosFile;
191                         pDataArray->m->openOutputFile(chimeraFileName, chimeraFile); 
192                         pDataArray->m->openOutputFile(accnosFileName, accnosFile); 
193                         
194                         Perseus myPerseus;
195                         vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
196                         
197                         chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
198                         
199                         vector<bool> chimeras(numSeqs, 0);
200                         
201                         for(int j=0;j<numSeqs;j++){     
202                                 
203                                 if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
204                                 
205                                 vector<bool> restricted = chimeras;
206                                 
207                                 vector<vector<int> > leftDiffs(numSeqs);
208                                 vector<vector<int> > leftMaps(numSeqs);
209                                 vector<vector<int> > rightDiffs(numSeqs);
210                                 vector<vector<int> > rightMaps(numSeqs);
211                                 
212                                 vector<int> singleLeft, bestLeft;
213                                 vector<int> singleRight, bestRight;
214                                 
215                                 int bestSingleIndex, bestSingleDiff;
216                                 vector<pwAlign> alignments(numSeqs);
217                                 
218                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
219                                 
220                                 if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
221                                 
222                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
223                                 
224                                 string dummyA, dummyB;
225                                 
226                                 if(comparisons >= 2){   
227                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
228                                         
229                                         if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
230                                         
231                                         int minMismatchToTrimera = numeric_limits<int>::max();
232                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
233                                         
234                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
235                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
236                                                 
237                                                 if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
238                                         }
239                                         
240                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
241                                         
242                                         if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
243                                         
244                                         string type;
245                                         string chimeraRefSeq;
246                                         
247                                         if(minMismatchToChimera - minMismatchToTrimera >= 3){
248                                                 type = "trimera";
249                                                 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
250                                         }
251                                         else{
252                                                 type = "chimera";
253                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
254                                         }
255                                         
256                                         if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
257                                         
258                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
259                                         
260                                         if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
261                                         
262                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
263                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
264                                         
265                                         if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
266                                         
267                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
268                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
269                                         chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
270                                         chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
271                                         
272                                         double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
273                                         
274                                         chimeraFile << probability << '\t';
275                                         
276                                         if(probability > pDataArray->cutoff){ 
277                                                 chimeraFile << type << endl;
278                                                 accnosFile << sequences[j].seqName << endl;
279                                                 chimeras[j] = 1;
280                                                 numChimeras++;
281                                         }
282                                         else{
283                                                 chimeraFile << "good" << endl;
284                                         }
285                                         
286                                 }
287                                 else{
288                                         chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
289                                 }
290                                 //report progress
291                                 if((j+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n");               }
292                         }
293                         
294                         if((numSeqs) % 100 != 0){       pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n");           }
295                         
296                         chimeraFile.close();
297                         accnosFile.close();
298                         ////////////////////////////////////////////////////////////////////////////////////////
299
300                         totalSeqs += numSeqs;
301                         
302                         //append files
303                         pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
304                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
305                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + ".");        pDataArray->m->mothurOutEndLine();                                      
306                         
307                         if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
308                 }       
309                 
310                 pDataArray->count = totalSeqs;
311                 delete parser;
312                 return totalSeqs;
313                 
314         }
315         catch(exception& e) {
316                 pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
317                 exit(1);
318         }
319
320 /**************************************************************************************************/
321
322 #endif
323
324 #endif
325
326