/*
 *  chimeraperseuscommand.cpp
 *
 *  Created by westcott on 10/26/11.
 *  Copyright 2011 Schloss Lab. All rights reserved.
 */
10 #include "chimeraperseuscommand.h"
11 #include "deconvolutecommand.h"
12 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ChimeraPerseusCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
21 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
22 CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pcutoff);
23 CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "",false,false); parameters.push_back(palpha);
24 CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "",false,false); parameters.push_back(pbeta);
26 vector<string> myArray;
27 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
31 m->errorOut(e, "ChimeraPerseusCommand", "setParameters");
35 //**********************************************************************************************************************
36 string ChimeraPerseusCommand::getHelpString(){
38 string helpString = "";
39 helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
40 helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
41 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
42 helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
43 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
44 helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
45 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
46 helpString += "The alpha parameter .... The default is -5.54. \n";
47 helpString += "The beta parameter .... The default is 0.33. \n";
48 helpString += "The cutoff parameter .... The default is 0.50. \n";
49 helpString += "The chimera.perseus command should be in the following format: \n";
50 helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n";
51 helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
56 m->errorOut(e, "ChimeraPerseusCommand", "getHelpString");
60 //**********************************************************************************************************************
61 ChimeraPerseusCommand::ChimeraPerseusCommand(){
63 abort = true; calledHelp = true;
65 vector<string> tempOutNames;
66 outputTypes["chimera"] = tempOutNames;
67 outputTypes["accnos"] = tempOutNames;
70 m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
74 //***************************************************************************************************************
75 ChimeraPerseusCommand::ChimeraPerseusCommand(string option) {
77 abort = false; calledHelp = false;
79 //allow user to run help
80 if(option == "help") { help(); abort = true; calledHelp = true; }
81 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
84 vector<string> myArray = setParameters();
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter("chimera.uchime");
90 map<string,string>::iterator it;
92 //check to make sure all parameters are valid for command
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
97 vector<string> tempOutNames;
98 outputTypes["chimera"] = tempOutNames;
99 outputTypes["accnos"] = tempOutNames;
101 //if the user changes the input directory command factory will send this info to us in the output parameter
102 string inputDir = validParameter.validFile(parameters, "inputdir", false);
103 if (inputDir == "not found"){ inputDir = ""; }
105 //check for required parameters
106 fastafile = validParameter.validFile(parameters, "fasta", false);
107 if (fastafile == "not found") {
108 //if there is a current fasta file, use it
109 string filename = m->getFastaFile();
110 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
111 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
113 m->splitAtDash(fastafile, fastaFileNames);
115 //go through files and make sure they are good, if not, then disregard them
116 for (int i = 0; i < fastaFileNames.size(); i++) {
119 if (fastaFileNames[i] == "current") {
120 fastaFileNames[i] = m->getFastaFile();
121 if (fastaFileNames[i] != "") { m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
123 m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true;
124 //erase from file list
125 fastaFileNames.erase(fastaFileNames.begin()+i);
132 if (inputDir != "") {
133 string path = m->hasPath(fastaFileNames[i]);
134 //if the user has not given a path then, add inputdir. else leave path alone.
135 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
141 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
143 //if you can't open it, try default location
144 if (ableToOpen == 1) {
145 if (m->getDefaultPath() != "") { //default path is set
146 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
147 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
149 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
151 fastaFileNames[i] = tryPath;
155 if (ableToOpen == 1) {
156 if (m->getOutputDir() != "") { //default path is set
157 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
158 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
160 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
162 fastaFileNames[i] = tryPath;
168 if (ableToOpen == 1) {
169 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
170 //erase from file list
171 fastaFileNames.erase(fastaFileNames.begin()+i);
174 m->setFastaFile(fastaFileNames[i]);
179 //make sure there is at least one valid file left
180 if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
184 //check for required parameters
186 namefile = validParameter.validFile(parameters, "name", false);
187 if (namefile == "not found") {
188 //if there is a current fasta file, use it
189 string filename = m->getNameFile();
190 if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
191 else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
194 m->splitAtDash(namefile, nameFileNames);
196 //go through files and make sure they are good, if not, then disregard them
197 for (int i = 0; i < nameFileNames.size(); i++) {
200 if (nameFileNames[i] == "current") {
201 nameFileNames[i] = m->getNameFile();
202 if (nameFileNames[i] != "") { m->mothurOut("Using " + nameFileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
204 m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true;
205 //erase from file list
206 nameFileNames.erase(nameFileNames.begin()+i);
213 if (inputDir != "") {
214 string path = m->hasPath(nameFileNames[i]);
215 //if the user has not given a path then, add inputdir. else leave path alone.
216 if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; }
222 ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror");
224 //if you can't open it, try default location
225 if (ableToOpen == 1) {
226 if (m->getDefaultPath() != "") { //default path is set
227 string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]);
228 m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
230 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232 nameFileNames[i] = tryPath;
236 if (ableToOpen == 1) {
237 if (m->getOutputDir() != "") { //default path is set
238 string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]);
239 m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
241 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
243 nameFileNames[i] = tryPath;
249 if (ableToOpen == 1) {
250 m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
251 //erase from file list
252 nameFileNames.erase(nameFileNames.begin()+i);
255 m->setNameFile(nameFileNames[i]);
260 //make sure there is at least one valid file left
261 if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
264 if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
266 bool hasGroup = true;
267 groupfile = validParameter.validFile(parameters, "group", false);
268 if (groupfile == "not found") { groupfile = ""; hasGroup = false; }
270 m->splitAtDash(groupfile, groupFileNames);
272 //go through files and make sure they are good, if not, then disregard them
273 for (int i = 0; i < groupFileNames.size(); i++) {
276 if (groupFileNames[i] == "current") {
277 groupFileNames[i] = m->getGroupFile();
278 if (groupFileNames[i] != "") { m->mothurOut("Using " + groupFileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); }
280 m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true;
281 //erase from file list
282 groupFileNames.erase(groupFileNames.begin()+i);
289 if (inputDir != "") {
290 string path = m->hasPath(groupFileNames[i]);
291 //if the user has not given a path then, add inputdir. else leave path alone.
292 if (path == "") { groupFileNames[i] = inputDir + groupFileNames[i]; }
298 ableToOpen = m->openInputFile(groupFileNames[i], in, "noerror");
300 //if you can't open it, try default location
301 if (ableToOpen == 1) {
302 if (m->getDefaultPath() != "") { //default path is set
303 string tryPath = m->getDefaultPath() + m->getSimpleName(groupFileNames[i]);
304 m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
306 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
308 groupFileNames[i] = tryPath;
312 if (ableToOpen == 1) {
313 if (m->getOutputDir() != "") { //default path is set
314 string tryPath = m->getOutputDir() + m->getSimpleName(groupFileNames[i]);
315 m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
317 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
319 groupFileNames[i] = tryPath;
325 if (ableToOpen == 1) {
326 m->mothurOut("Unable to open " + groupFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
327 //erase from file list
328 groupFileNames.erase(groupFileNames.begin()+i);
331 m->setGroupFile(groupFileNames[i]);
336 //make sure there is at least one valid file left
337 if (groupFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid group files."); m->mothurOutEndLine(); abort = true; }
340 if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
343 //if the user changes the output directory command factory will send this info to us in the output parameter
344 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
346 string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
347 m->setProcessors(temp);
348 m->mothurConvert(temp, processors);
350 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.50"; }
351 m->mothurConvert(temp, cutoff);
353 temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found"){ temp = "-5.54"; }
354 m->mothurConvert(temp, alpha);
356 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.33"; }
357 m->mothurConvert(temp, beta);
360 catch(exception& e) {
361 m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
365 //***************************************************************************************************************
367 int ChimeraPerseusCommand::execute(){
369 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 for (int s = 0; s < fastaFileNames.size(); s++) {
375 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
377 int start = time(NULL);
378 if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
379 string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.chimera";
380 string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "perseus.accnos";
381 //string newFasta = m->getRootName(fastaFileNames[s]) + "temp";
383 //you provided a groupfile
384 string groupFile = "";
385 if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; }
387 string nameFile = "";
388 if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
389 nameFile = nameFileNames[s];
390 }else { nameFile = getNamesFile(fastaFileNames[s]); }
392 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
397 if (groupFile != "") {
398 //Parse sequences by group
399 SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
400 vector<string> groups = parser.getNamesOfGroups();
402 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
405 ofstream out, out1, out2;
406 m->openOutputFile(outputFileName, out); out.close();
407 m->openOutputFile(accnosFileName, out1); out1.close();
409 if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); }
410 else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
412 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
414 numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
416 m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
418 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
421 if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
423 //read sequences and store sorted by frequency
424 vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
426 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
428 numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
431 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
433 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine();
434 outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName);
435 outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
438 //set accnos file as new current accnosfile
440 itTypes = outputTypes.find("accnos");
441 if (itTypes != outputTypes.end()) {
442 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
445 m->mothurOutEndLine();
446 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
447 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
448 m->mothurOutEndLine();
453 catch(exception& e) {
454 m->errorOut(e, "ChimeraPerseusCommand", "execute");
458 //**********************************************************************************************************************
459 string ChimeraPerseusCommand::getNamesFile(string& inputFile){
461 string nameFile = "";
463 m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
465 //use unique.seqs to create new name and fastafile
466 string inputString = "fasta=" + inputFile;
467 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
468 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
470 Command* uniqueCommand = new DeconvoluteCommand(inputString);
471 uniqueCommand->execute();
473 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
475 delete uniqueCommand;
477 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
479 nameFile = filenames["name"][0];
480 inputFile = filenames["fasta"][0];
484 catch(exception& e) {
485 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
489 //**********************************************************************************************************************
490 int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
496 for (int i = start; i < end; i++) {
498 m->mothurOutEndLine(); m->mothurOut("Checking sequences from group " + groups[i] + "..."); m->mothurOutEndLine();
500 int start = time(NULL); if (m->control_pressed) { return 0; }
502 vector<seqData> sequences = loadSequences(parser, groups[i]);
504 if (m->control_pressed) { return 0; }
506 int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras);
507 totalSeqs += numSeqs;
509 if (m->control_pressed) { return 0; }
512 m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i]));
513 m->appendFiles((accnos+groups[i]), accnos); m->mothurRemove((accnos+groups[i]));
515 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine();
521 catch(exception& e) {
522 m->errorOut(e, "ChimeraPerseusCommand", "driverGroups");
526 //**********************************************************************************************************************
527 vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
530 vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
531 map<string, string> nameMap = parser.getNameMap(group);
532 map<string, string>::iterator it;
534 vector<seqData> sequences;
537 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
539 if (m->control_pressed) { return sequences; }
541 it = nameMap.find(thisGroupsSeqs[i].getName());
542 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
544 int num = m->getNumNames(it->second);
545 sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
549 if (error) { m->control_pressed = true; }
552 sort(sequences.rbegin(), sequences.rend());
556 catch(exception& e) {
557 m->errorOut(e, "ChimeraPerseusCommand", "loadSequences");
562 //**********************************************************************************************************************
563 vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
565 map<string, int>::iterator it;
566 map<string, int> nameMap = m->readNames(name);
568 //read fasta file and create sequenceData structure - checking for file mismatches
569 vector<seqData> sequences;
572 m->openInputFile(inputFile, in);
576 if (m->control_pressed) { in.close(); return sequences; }
578 Sequence temp(in); m->gobble(in);
580 it = nameMap.find(temp.getName());
581 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
583 sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
588 if (error) { m->control_pressed = true; }
591 sort(sequences.rbegin(), sequences.rend());
595 catch(exception& e) {
596 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
600 //**********************************************************************************************************************
601 int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& sequences, string accnosFileName, int& numChimeras){
604 vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
605 for(int i=0;i<4;i++){ correctModel[i].resize(4); }
607 correctModel[0][0] = 0.000000; //AA
608 correctModel[1][0] = 11.619259; //CA
609 correctModel[2][0] = 11.694004; //TA
610 correctModel[3][0] = 7.748623; //GA
612 correctModel[1][1] = 0.000000; //CC
613 correctModel[2][1] = 7.619657; //TC
614 correctModel[3][1] = 12.852562; //GC
616 correctModel[2][2] = 0.000000; //TT
617 correctModel[3][2] = 10.964048; //TG
619 correctModel[3][3] = 0.000000; //GG
621 for(int i=0;i<4;i++){
622 for(int j=0;j<i;j++){
623 correctModel[j][i] = correctModel[i][j];
627 int numSeqs = sequences.size();
628 int alignLength = sequences[0].sequence.size();
630 ofstream chimeraFile;
632 m->openOutputFile(chimeraFileName, chimeraFile);
633 m->openOutputFile(accnosFileName, accnosFile);
636 vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
638 chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
640 vector<bool> chimeras(numSeqs, 0);
642 for(int i=0;i<numSeqs;i++){
643 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
645 vector<bool> restricted = chimeras;
647 vector<vector<int> > leftDiffs(numSeqs);
648 vector<vector<int> > leftMaps(numSeqs);
649 vector<vector<int> > rightDiffs(numSeqs);
650 vector<vector<int> > rightMaps(numSeqs);
652 vector<int> singleLeft, bestLeft;
653 vector<int> singleRight, bestRight;
655 int bestSingleIndex, bestSingleDiff;
656 vector<pwAlign> alignments(numSeqs);
658 int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
659 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
661 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
663 string dummyA, dummyB;
665 if(comparisons >= 2){
666 minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
667 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
669 int minMismatchToTrimera = numeric_limits<int>::max();
670 int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
672 if(minMismatchToChimera >= 3 && comparisons >= 3){
673 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
674 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
677 double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
679 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
682 string chimeraRefSeq;
684 if(minMismatchToChimera - minMismatchToTrimera >= 3){
686 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
690 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
693 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
695 double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
697 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
699 double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
700 double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
702 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
704 chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
705 chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
706 chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
707 chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
709 double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, alpha, beta);
711 chimeraFile << probability << '\t';
713 if(probability > cutoff){
714 chimeraFile << type << endl;
715 accnosFile << sequences[i].seqName << endl;
720 chimeraFile << "good" << endl;
725 chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
729 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1) + "\n"); }
732 if((numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); }
739 catch(exception& e) {
740 m->errorOut(e, "ChimeraPerseusCommand", "driver");
744 /**************************************************************************************************/
745 int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
748 vector<int> processIDS;
753 if (groups.size() < processors) { processors = groups.size(); }
755 //divide the groups between the processors
756 vector<linePair> lines;
757 int numGroupsPerProcessor = groups.size() / processors;
758 for (int i = 0; i < processors; i++) {
759 int startIndex = i * numGroupsPerProcessor;
760 int endIndex = (i+1) * numGroupsPerProcessor;
761 if(i == (processors - 1)){ endIndex = groups.size(); }
762 lines.push_back(linePair(startIndex, endIndex));
765 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
767 //loop through and create all the processes you want
768 while (process != processors) {
772 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
775 num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
777 //pass numSeqs to parent
779 string tempFile = outputFName + toString(getpid()) + ".num.temp";
780 m->openOutputFile(tempFile, out);
786 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
787 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
793 num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
795 //force parent to wait until all the processes are done
796 for (int i=0;i<processIDS.size();i++) {
797 int temp = processIDS[i];
801 for (int i = 0; i < processIDS.size(); i++) {
803 string tempFile = outputFName + toString(processIDS[i]) + ".num.temp";
804 m->openInputFile(tempFile, in);
805 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
806 in.close(); m->mothurRemove(tempFile);
810 //////////////////////////////////////////////////////////////////////////////////////////////////////
811 //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
812 //Above fork() will clone, so memory is separate, but that's not the case with windows,
813 //////////////////////////////////////////////////////////////////////////////////////////////////////
815 vector<perseusData*> pDataArray;
816 DWORD dwThreadIdArray[processors-1];
817 HANDLE hThreadArray[processors-1];
819 //Create processor worker threads.
820 for( int i=1; i<processors; i++ ){
821 // Allocate memory for thread data.
822 string extension = toString(i) + ".temp";
824 perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
826 pDataArray.push_back(tempPerseus);
827 processIDS.push_back(i);
829 //MyPerseusThreadFunction is in header. It must be global or static to work with the threads.
830 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
831 hThreadArray[i-1] = CreateThread(NULL, 0, MyPerseusThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
835 //using the main process as a worker saves time and memory
836 num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
838 //Wait until all threads have terminated.
839 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
841 //Close all thread handles and free memory allocations.
842 for(int i=0; i < pDataArray.size(); i++){
843 num += pDataArray[i]->count;
844 CloseHandle(hThreadArray[i]);
845 delete pDataArray[i];
850 //append output files
851 for(int i=0;i<processIDS.size();i++){
852 m->appendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName);
853 m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp"));
855 m->appendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos);
856 m->mothurRemove((accnos + toString(processIDS[i]) + ".temp"));
862 catch(exception& e) {
863 m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups");
867 //**********************************************************************************************************************
868 int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
870 map<string, string> uniqueNames = parser.getAllSeqsMap();
871 map<string, string>::iterator itUnique;
876 m->openInputFile(accnosFileName, in2);
879 m->openOutputFile(accnosFileName+".temp", out2);
882 set<string> namesInFile; //this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once
883 set<string>::iterator itNames;
884 set<string> chimerasInFile;
885 set<string>::iterator itChimeras;
889 if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; }
891 in2 >> name; m->gobble(in2);
894 itUnique = uniqueNames.find(name);
896 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
898 itChimeras = chimerasInFile.find((itUnique->second));
900 if (itChimeras == chimerasInFile.end()) {
901 out2 << itUnique->second << endl;
902 chimerasInFile.insert((itUnique->second));
910 m->mothurRemove(accnosFileName);
911 rename((accnosFileName+".temp").c_str(), accnosFileName.c_str());
915 m->openInputFile(outputFileName, in);
918 m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
920 int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent;
921 float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8;
922 string index, BestMatchName, parent1, parent2, flag;
925 //assumptions - in file each read will always look like
927 SequenceIndex Name DiffsToBestMatch BestMatchIndex BestMatchName DiffstToChimera IndexofLeftParent IndexOfRightParent NameOfLeftParent NameOfRightParent DistanceToBestMatch cIndex (cIndex - singleDist) loonIndex MismatchesToChimera MismatchToTrimera ChimeraBreakPoint LogisticProbability TypeOfSequence
928 0 F01QG4L02JVBQY 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
929 1 F01QG4L02ICTC6 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
930 2 F01QG4L02JZOEC 48 0 F01QG4L02JVBQY 47 0 0 F01QG4L02JVBQY F01QG4L02JVBQY 2.0449 2.03545 -0.00944493 0 47 2147483647 138 0 good
931 3 F01QG4L02G7JEC 42 0 F01QG4L02JVBQY 40 1 0 F01QG4L02ICTC6 F01QG4L02JVBQY 1.87477 1.81113 -0.0636404 5.80145 40 2147483647 25 0 good
934 //get and print headers
935 BestMatchName = m->getline(in); m->gobble(in);
936 out << BestMatchName << endl;
940 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove((outputFileName+".temp")); return 0; }
943 in >> index; m->gobble(in);
945 if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given
946 in >> name; m->gobble(in);
947 in >> DiffsToBestMatch; m->gobble(in);
948 in >> BestMatchIndex; m->gobble(in);
949 in >> BestMatchName; m->gobble(in);
950 in >> DiffstToChimera; m->gobble(in);
951 in >> IndexofLeftParent; m->gobble(in);
952 in >> IndexOfRightParent; m->gobble(in);
953 in >> parent1; m->gobble(in);
954 in >> parent2; m->gobble(in);
955 in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; m->gobble(in);
958 itUnique = uniqueNames.find(name);
960 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
962 name = itUnique->second;
963 //is this name already in the file
964 itNames = namesInFile.find((name));
966 if (itNames == namesInFile.end()) { //no not in file
967 if (flag == "good") { //are you really a no??
968 //is this sequence really not chimeric??
969 itChimeras = chimerasInFile.find(name);
971 //then you really are a no so print, otherwise skip
972 if (itChimeras == chimerasInFile.end()) { print = true; }
973 }else{ print = true; }
978 out << index << '\t' << name << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t';
979 namesInFile.insert(name);
981 if (BestMatchName != "Null") {
982 itUnique = uniqueNames.find(BestMatchName);
983 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find BestMatchName "+ BestMatchName + "."); m->mothurOutEndLine(); m->control_pressed = true; }
984 else { out << itUnique->second << '\t'; }
985 }else { out << "Null" << '\t'; }
987 out << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t';
989 if (parent1 != "Null") {
990 itUnique = uniqueNames.find(parent1);
991 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent1 "+ parent1 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
992 else { out << itUnique->second << '\t'; }
993 }else { out << "Null" << '\t'; }
995 if (parent1 != "Null") {
996 itUnique = uniqueNames.find(parent2);
997 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent2 "+ parent2 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
998 else { out << itUnique->second << '\t'; }
999 }else { out << "Null" << '\t'; }
1001 out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl;
1003 }else { index = m->getline(in); m->gobble(in); }
1008 m->mothurRemove(outputFileName);
1009 rename((outputFileName+".temp").c_str(), outputFileName.c_str());
1013 catch(exception& e) {
1014 m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults");
1018 //**********************************************************************************************************************