X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=chimeraperseuscommand.h;fp=chimeraperseuscommand.h;h=84563ac6ecc006ca7150f7f5e025b340da964fdf;hb=74dc92cf53df65fd8b14d8eaf35489bbecbccac6;hp=0000000000000000000000000000000000000000;hpb=4c5e7a20a03ddc6feb49ff9d21fcb4c79bc5508d;p=mothur.git diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h new file mode 100644 index 0000000..84563ac --- /dev/null +++ b/chimeraperseuscommand.h @@ -0,0 +1,325 @@ +#ifndef CHIMERAPERSEUSCOMMAND_H +#define CHIMERAPERSEUSCOMMAND_H + + +/* + * chimeraperseuscommand.h + * Mothur + * + * Created by westcott on 10/26/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + + +#include "mothur.h" +#include "command.hpp" +#include "sequenceparser.h" +#include "myPerseus.h" + +/***********************************************************/ +class ChimeraPerseusCommand : public Command { +public: + ChimeraPerseusCommand(string); + ChimeraPerseusCommand(); + ~ChimeraPerseusCommand() {} + + vector setParameters(); + string getCommandName() { return "chimera.perseus"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Chimera.perseus\n"; } + string getDescription() { return "detect chimeric sequences"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + bool abort; + string fastafile, groupfile, outputDir, namefile; + int processors; + double cutoff, alpha, beta; + + vector outputNames; + vector fastaFileNames; + vector nameFileNames; + vector groupFileNames; + + string getNamesFile(string&); + int driver(string, vector&, string, int&); + vector readFiles(string, string); + vector loadSequences(SequenceParser&, string); + int deconvoluteResults(SequenceParser&, string, string); + int driverGroups(SequenceParser&, string, string, int, int, vector); + int createProcessesGroups(SequenceParser&, string, string, vector, string, string, string); +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct perseusData { + string fastafile; + string namefile; + string groupfile; + string outputFName; + string accnos; + MothurOut* m; + int start; + int end; + int threadID, count, numChimeras; + double alpha, beta, cutoff; + vector groups; + + perseusData(){} + perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + alpha = a; + beta = b; + cutoff = c; + fastafile = f; + namefile = n; + groupfile = g; + outputFName = o; + accnos = ac; + m = mout; + start = st; + end = en; + threadID = tid; + groups = gr; + count = 0; + numChimeras = 0; + } +}; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ + perseusData* pDataArray; + pDataArray = (perseusData*)lpParam; + + try { + + //clears files + ofstream out, out1, out2; + pDataArray->m->openOutputFile(pDataArray->outputFName, out); out.close(); + pDataArray->m->openOutputFile(pDataArray->accnos, out1); out1.close(); + + //parse fasta and name file by group + SequenceParser* parser; + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + + int totalSeqs = 0; + int numChimeras = 0; + + for (int i = pDataArray->start; i < pDataArray->end; i++) { + + int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); + + //vector sequences = loadSequences(parser, groups[i]); - same function below + //////////////////////////////////////////////////////////////////////////////////////// + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); + map nameMap = parser->getNameMap(pDataArray->groups[i]); + map::iterator it; + + vector sequences; + bool error = false; + + for (int j = 0; j < thisGroupsSeqs.size(); j++) { + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + it = nameMap.find(thisGroupsSeqs[j].getName()); + if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + int num = pDataArray->m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num)); + } + } + + if (error) { pDataArray->m->control_pressed = true; } + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + //////////////////////////////////////////////////////////////////////////////////////// + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + + //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below + //////////////////////////////////////////////////////////////////////////////////////// + string chimeraFileName = pDataArray->outputFName+pDataArray->groups[i]; + string accnosFileName = pDataArray->accnos+pDataArray->groups[i]; + + vector > correctModel(4); //could be an option in the future to input own model matrix + for(int j=0;j<4;j++){ correctModel[j].resize(4); } + + correctModel[0][0] = 0.000000; //AA + correctModel[1][0] = 11.619259; //CA + correctModel[2][0] = 11.694004; //TA + correctModel[3][0] = 7.748623; //GA + + correctModel[1][1] = 0.000000; //CC + correctModel[2][1] = 7.619657; //TC + correctModel[3][1] = 12.852562; //GC + + correctModel[2][2] = 0.000000; //TT + correctModel[3][2] = 10.964048; //TG + + correctModel[3][3] = 0.000000; //GG + + for(int k=0;k<4;k++){ + for(int j=0;jm->openOutputFile(chimeraFileName, chimeraFile); + pDataArray->m->openOutputFile(accnosFileName, accnosFile); + + Perseus myPerseus; + vector > binMatrix = myPerseus.binomial(alignLength); + + chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n"; + + vector chimeras(numSeqs, 0); + + for(int j=0;jm->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + vector restricted = chimeras; + + vector > leftDiffs(numSeqs); + vector > leftMaps(numSeqs); + vector > rightDiffs(numSeqs); + vector > rightMaps(numSeqs); + + vector singleLeft, bestLeft; + vector singleRight, bestRight; + + int bestSingleIndex, bestSingleDiff; + vector alignments(numSeqs); + + int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; + + string dummyA, dummyB; + + if(comparisons >= 2){ + minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + int minMismatchToTrimera = numeric_limits::max(); + int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; + + if(minMismatchToChimera >= 3 && comparisons >= 3){ + minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + } + + double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + string type; + string chimeraRefSeq; + + if(minMismatchToChimera - minMismatchToTrimera >= 3){ + type = "trimera"; + chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps); + } + else{ + type = "chimera"; + chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); + } + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq); + double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + + chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; + chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; + chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t'; + chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t'; + + double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, pDataArray->alpha, pDataArray->beta); + + chimeraFile << probability << '\t'; + + if(probability > pDataArray->cutoff){ + chimeraFile << type << endl; + accnosFile << sequences[j].seqName << endl; + chimeras[j] = 1; + numChimeras++; + } + else{ + chimeraFile << "good" << endl; + } + + } + else{ + chimeraFile << j << '\t' << sequences[j].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; + } + //report progress + if((j+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(j+1) + "\n"); } + } + + if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); } + + chimeraFile.close(); + accnosFile.close(); + //////////////////////////////////////////////////////////////////////////////////////// + + totalSeqs += numSeqs; + + //append files + pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName); + pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); + + if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + } + + pDataArray->count = totalSeqs; + delete parser; + return totalSeqs; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ChimeraUchimeCommand", "MyPerseusThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ + +#endif + +#endif + +