1 #ifndef CHIMERAPERSEUSCOMMAND_H
2 #define CHIMERAPERSEUSCOMMAND_H
6 * chimeraperseuscommand.h
9 * Created by westcott on 10/26/11.
10 * Copyright 2011 Schloss Lab. All rights reserved.
17 #include "command.hpp"
18 #include "sequenceparser.h"
19 #include "sequencecountparser.h"
20 #include "myPerseus.h"
21 #include "counttable.h"
23 /***********************************************************/
24 class ChimeraPerseusCommand : public Command {
26 ChimeraPerseusCommand(string);
27 ChimeraPerseusCommand();
28 ~ChimeraPerseusCommand() {}
30 vector<string> setParameters();
31 string getCommandName() { return "chimera.perseus"; }
32 string getCommandCategory() { return "Sequence Processing"; }
34 string getHelpString();
35 string getOutputPattern(string);
36 string getCitation() { return "Quince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection. Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.perseus\n"; }
37 string getDescription() { return "detect chimeric sequences"; }
40 void help() { m->mothurOut(getHelpString()); }
46 linePair(int i, int j) : start(i), end(j) {}
49 bool abort, hasName, hasCount, dups;
50 string fastafile, groupfile, countfile, outputDir, namefile;
51 int processors, alignLength;
52 double cutoff, alpha, beta;
53 SequenceParser* parser;
54 SequenceCountParser* cparser;
56 vector<string> outputNames;
57 vector<string> fastaFileNames;
58 vector<string> nameFileNames;
59 vector<string> groupFileNames;
61 string getNamesFile(string&);
62 int driver(string, vector<seqData>&, string, int&);
63 vector<seqData> readFiles(string, string);
64 vector<seqData> readFiles(string inputFile, CountTable* ct);
65 vector<seqData> loadSequences(string);
66 int deconvoluteResults(map<string, string>&, string, string);
67 int driverGroups(string, string, string, int, int, vector<string>);
68 int createProcessesGroups(string, string, string, vector<string>, string, string, string);
69 string removeNs(string);
72 /**************************************************************************************************/
73 //custom data structure for threads to use.
74 // This is passed by void pointer so it can be any data type
75 // that can be passed using a single void pointer (LPVOID).
86 bool hasName, hasCount, dups;
87 int threadID, count, numChimeras;
88 double alpha, beta, cutoff;
89 vector<string> groups;
92 perseusData(bool dps, bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, string ctlist, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
114 /**************************************************************************************************/
115 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
117 static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
118 perseusData* pDataArray;
119 pDataArray = (perseusData*)lpParam;
124 ofstream out, out1, out2;
125 pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close();
126 pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close();
128 //parse fasta and name file by group
129 SequenceParser* parser;
130 SequenceCountParser* cparser;
131 if (pDataArray->hasCount) {
132 CountTable* ct = new CountTable();
133 ct->readTable(pDataArray->namefile);
134 cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
137 if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
138 else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
144 ofstream outCountList;
145 if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); }
147 for (int u = pDataArray->start; u < pDataArray->end; u++) {
149 int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
151 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[u] + "..."); pDataArray->m->mothurOutEndLine();
153 //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
154 ////////////////////////////////////////////////////////////////////////////////////////
157 vector<seqData> sequences;
158 if (pDataArray->hasCount) {
159 vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[u]);
160 map<string, int> counts = cparser->getCountTable(pDataArray->groups[u]);
161 map<string, int>::iterator it;
163 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
165 if (pDataArray->m->control_pressed) { break; }
167 it = counts.find(thisGroupsSeqs[i].getName());
168 if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
171 string tempSeq = thisGroupsSeqs[i].getUnaligned();
172 for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } }
173 thisGroupsSeqs[i].setAligned(newSeq);
175 sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
176 if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
180 vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[u]);
181 map<string, string> nameMap = parser->getNameMap(pDataArray->groups[u]);
182 map<string, string>::iterator it;
184 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
186 if (pDataArray->m->control_pressed) { break; }
188 it = nameMap.find(thisGroupsSeqs[i].getName());
189 if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
191 int num = pDataArray->m->getNumNames(it->second);
193 string tempSeq = thisGroupsSeqs[i].getUnaligned();
194 for (int j = 0; j < tempSeq.length(); j++) { if (tempSeq[j] != 'N') { newSeq += tempSeq[j]; } }
195 thisGroupsSeqs[i].setAligned(newSeq);
197 sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
198 if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
205 if (error) { pDataArray->m->control_pressed = true; }
208 sort(sequences.rbegin(), sequences.rend());
209 ////////////////////////////////////////////////////////////////////////////////////////
211 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
213 //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
214 ////////////////////////////////////////////////////////////////////////////////////////
215 string chimeraFileName = pDataArray->outputFName+pDataArray->groups[u];
216 string accnosFileName = pDataArray->accnos+pDataArray->groups[u];
218 vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
219 for(int j=0;j<4;j++){ correctModel[j].resize(4); }
221 correctModel[0][0] = 0.000000; //AA
222 correctModel[1][0] = 11.619259; //CA
223 correctModel[2][0] = 11.694004; //TA
224 correctModel[3][0] = 7.748623; //GA
226 correctModel[1][1] = 0.000000; //CC
227 correctModel[2][1] = 7.619657; //TC
228 correctModel[3][1] = 12.852562; //GC
230 correctModel[2][2] = 0.000000; //TT
231 correctModel[3][2] = 10.964048; //TG
233 correctModel[3][3] = 0.000000; //GG
235 for(int k=0;k<4;k++){
236 for(int j=0;j<k;j++){
237 correctModel[j][k] = correctModel[k][j];
241 int numSeqs = sequences.size();
242 //int alignLength = sequences[0].sequence.size();
244 ofstream chimeraFile;
246 pDataArray->m->openOutputFile(chimeraFileName, chimeraFile);
247 pDataArray->m->openOutputFile(accnosFileName, accnosFile);
250 vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
252 chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
254 vector<bool> chimeras(numSeqs, 0);
256 for(int j=0;j<numSeqs;j++){
258 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
260 vector<bool> restricted = chimeras;
262 vector<vector<int> > leftDiffs(numSeqs);
263 vector<vector<int> > leftMaps(numSeqs);
264 vector<vector<int> > rightDiffs(numSeqs);
265 vector<vector<int> > rightMaps(numSeqs);
267 vector<int> singleLeft, bestLeft;
268 vector<int> singleRight, bestRight;
270 int bestSingleIndex, bestSingleDiff;
271 vector<pwAlign> alignments(numSeqs);
273 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
275 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
277 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
279 string dummyA, dummyB;
281 if(comparisons >= 2){
282 minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
284 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
286 int minMismatchToTrimera = numeric_limits<int>::max();
287 int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
289 if(minMismatchToChimera >= 3 && comparisons >= 3){
290 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
292 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
295 double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
297 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
300 string chimeraRefSeq;
302 if(minMismatchToChimera - minMismatchToTrimera >= 3){
304 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
308 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
311 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
313 double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
315 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
317 double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
318 double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
320 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
322 chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
323 chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
324 chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
325 chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
327 double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta);
329 chimeraFile << probability << '\t';
331 if(probability > pDataArray->cutoff){
332 chimeraFile << type << endl;
333 accnosFile << sequences[j].seqName << endl;
338 chimeraFile << "good" << endl;
343 chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
346 if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); }
349 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); }
353 ////////////////////////////////////////////////////////////////////////////////////////
355 totalSeqs += numSeqs;
357 if (pDataArray->dups) {
358 if (!pDataArray->m->isBlank(accnosFileName)) {
360 pDataArray->m->openInputFile(accnosFileName, in);
362 if (pDataArray->hasCount) {
364 in >> name; pDataArray->m->gobble(in);
365 outCountList << name << '\t' << pDataArray->groups[u] << endl;
369 map<string, string> thisnamemap = parser->getNameMap(pDataArray->groups[u]);
370 map<string, string>::iterator itN;
372 pDataArray->m->openOutputFile(accnosFileName+".temp", out);
374 in >> name; pDataArray->m->gobble(in);
375 itN = thisnamemap.find(name);
376 if (itN != thisnamemap.end()) {
377 vector<string> tempNames; pDataArray->m->splitAtComma(itN->second, tempNames);
378 for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; }
380 }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; }
384 pDataArray->m->renameFile(accnosFileName+".temp", accnosFileName);
391 pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName);
392 pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
393 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[u] + "."); pDataArray->m->mothurOutEndLine();
395 if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
398 if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); }
400 pDataArray->count = totalSeqs;
401 if (pDataArray->hasCount) { delete cparser; } { delete parser; }
405 catch(exception& e) {
406 pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction");
410 /**************************************************************************************************/