2 * chimeraperseuscommand.cpp
5 * Created by westcott on 10/26/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "chimeraperseuscommand.h"
11 #include "deconvolutecommand.h"
12 #include "sequence.hpp"
13 //**********************************************************************************************************************
// Registers every parameter chimera.perseus accepts (fasta, name, group, processors, inputdir,
// outputdir, cutoff, alpha, beta) in the `parameters` member, then collects the parameter names
// into myArray.
// NOTE(review): this listing omits the function's short lines (try/catch framing, return statement);
// presumably myArray is the returned value — confirm against the full file.
14 vector<string> ChimeraPerseusCommand::setParameters(){
// NOTE(review): the 4th CommandParameter argument looks like the default value ("1", "0.5", "-5.54",
// "0.33" match the defaults advertised in the help text); the meaning of the remaining arguments is
// defined by CommandParameter, which is not visible here.
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
21 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
22 CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pcutoff);
23 CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "",false,false); parameters.push_back(palpha);
24 CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "",false,false); parameters.push_back(pbeta);
// Flatten the registered parameters into a plain list of their names.
26 vector<string> myArray;
27 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
// Error path: report the exception together with the class and function name.
31 m->errorOut(e, "ChimeraPerseusCommand", "setParameters");
35 //**********************************************************************************************************************
// Builds the user-facing help text for chimera.perseus, one sentence per appended line.
// NOTE(review): the return statement is not visible in this listing; presumably helpString is returned.
36 string ChimeraPerseusCommand::getHelpString(){
38 string helpString = "";
39 helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
40 helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
41 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
42 helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
43 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
44 helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
45 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
// TODO(review): the alpha, beta and cutoff descriptions below are still placeholders ("....").
// In this file alpha and beta are forwarded to the classifier and cutoff is the probability
// threshold above which a sequence is written to the accnos file; the text should say so once
// the exact wording is agreed.
46 helpString += "The alpha parameter .... The default is -5.54. \n";
47 helpString += "The beta parameter .... The default is 0.33. \n";
48 helpString += "The cutoff parameter .... The default is 0.50. \n";
49 helpString += "The chimera.perseus command should be in the following format: \n";
50 helpString += "chimera.perseus(fasta=yourFastaFile, name=yourNameFile) \n";
51 helpString += "Example: chimera.perseus(fasta=AD.align, name=AD.names) \n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
// Error path: report the exception together with the class and function name.
56 m->errorOut(e, "ChimeraPerseusCommand", "getHelpString");
60 //**********************************************************************************************************************
// Maps an output type to the file-name suffix this command appends to the input's root name:
//   "chimera" -> "perseus.chimeras", "accnos" -> "perseus.accnos".
// inputName is not referenced by any line visible here.
// Returns the suffix; it stays empty when the type has no tag defined.
61 string ChimeraPerseusCommand::getOutputFileNameTag(string type, string inputName=""){
63 string outputFileName = "";
64 map<string, vector<string> >::iterator it;
66 //is this a type this command creates
67 it = outputTypes.find(type);
68 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
// NOTE(review): the branch structure joining the check above to the chain below is not visible in
// this listing (short lines are omitted).
70 if (type == "chimera") { outputFileName = "perseus.chimeras"; }
71 else if (type == "accnos") { outputFileName = "perseus.accnos"; }
// An unknown type is reported and also raises control_pressed, the flag the command checks to stop work.
72 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
74 return outputFileName;
// Error path: report the exception together with the class and function name.
77 m->errorOut(e, "ChimeraPerseusCommand", "getOutputFileNameTag");
81 //**********************************************************************************************************************
// Default constructor: builds a command object that will not run (abort and calledHelp are both
// set) and only registers the output types this command can produce.
82 ChimeraPerseusCommand::ChimeraPerseusCommand(){
84 abort = true; calledHelp = true;
// Register both output types with empty file lists.
86 vector<string> tempOutNames;
87 outputTypes["chimera"] = tempOutNames;
88 outputTypes["accnos"] = tempOutNames;
// Error path: report the exception together with the class and function name.
91 m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
95 //***************************************************************************************************************
// Constructs the command from a user option string.
//  - "help" / "citation" print the corresponding text and mark the command as aborted.
//  - Otherwise the options are parsed and validated against setParameters(), the fasta / name /
//    group file lists are resolved (each may be a dash-separated list, and an entry may be the
//    word "current"), and processors, cutoff, alpha and beta are read with their defaults.
// Any validation failure sets abort = true so execute() does nothing.
// NOTE(review): this listing omits the function's short lines (else/closing braces and short
// declarations such as the stream objects, the `ignore` flag and `hasName`), so the exact
// nesting of the statements below must be confirmed against the full file.
// Fixes in this revision:
//  - beta is now read from the "beta" option (it was read from "cutoff").
//  - the parameter validator is named after this command instead of chimera.uchime.
//  - the missing-current-group-file message no longer talks about a namefile.
96 ChimeraPerseusCommand::ChimeraPerseusCommand(string option) {
98 abort = false; calledHelp = false;
100 //allow user to run help
101 if(option == "help") { help(); abort = true; calledHelp = true; }
102 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
105 vector<string> myArray = setParameters();
107 OptionParser parser(option);
108 map<string,string> parameters = parser.getParameters();
// Was "chimera.uchime" (copy/paste leftover). NOTE(review): how ValidParameters uses this
// string is not visible in this file.
110 ValidParameters validParameter("chimera.perseus");
111 map<string,string>::iterator it;
113 //check to make sure all parameters are valid for command
114 for (it = parameters.begin(); it != parameters.end(); it++) {
115 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
118 vector<string> tempOutNames;
119 outputTypes["chimera"] = tempOutNames;
120 outputTypes["accnos"] = tempOutNames;
122 //if the user changes the input directory command factory will send this info to us in the output parameter
123 string inputDir = validParameter.validFile(parameters, "inputdir", false);
124 if (inputDir == "not found"){ inputDir = ""; }
// ---- fasta files -------------------------------------------------------------------------
126 //check for required parameters
127 fastafile = validParameter.validFile(parameters, "fasta", false);
128 if (fastafile == "not found") {
129 //if there is a current fasta file, use it
130 string filename = m->getFastaFile();
131 if (filename != "") { fastaFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
132 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
134 m->splitAtDash(fastafile, fastaFileNames);
136 //go through files and make sure they are good, if not, then disregard them
137 for (int i = 0; i < fastaFileNames.size(); i++) {
// "current" is replaced by the session's current fasta file, or dropped if there is none.
140 if (fastaFileNames[i] == "current") {
141 fastaFileNames[i] = m->getFastaFile();
142 if (fastaFileNames[i] != "") { m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); }
144 m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true;
145 //erase from file list
146 fastaFileNames.erase(fastaFileNames.begin()+i);
153 if (inputDir != "") {
154 string path = m->hasPath(fastaFileNames[i]);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
// Try the name as given, then the default path, then the output directory.
162 ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror");
164 //if you can't open it, try default location
165 if (ableToOpen == 1) {
166 if (m->getDefaultPath() != "") { //default path is set
167 string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]);
168 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
170 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
172 fastaFileNames[i] = tryPath;
176 if (ableToOpen == 1) {
177 if (m->getOutputDir() != "") { //output directory is set
178 string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]);
179 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
181 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
183 fastaFileNames[i] = tryPath;
189 if (ableToOpen == 1) {
190 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
191 //erase from file list
192 fastaFileNames.erase(fastaFileNames.begin()+i);
// A file that opened becomes the session's current fasta file.
195 m->setFastaFile(fastaFileNames[i]);
200 //make sure there is at least one valid file left
201 if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
// ---- name files (same resolution steps as for fasta) --------------------------------------
205 //check for required parameters
207 namefile = validParameter.validFile(parameters, "name", false);
208 if (namefile == "not found") {
209 //if there is a current name file, use it
210 string filename = m->getNameFile();
211 if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
212 else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
215 m->splitAtDash(namefile, nameFileNames);
217 //go through files and make sure they are good, if not, then disregard them
218 for (int i = 0; i < nameFileNames.size(); i++) {
221 if (nameFileNames[i] == "current") {
222 nameFileNames[i] = m->getNameFile();
223 if (nameFileNames[i] != "") { m->mothurOut("Using " + nameFileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); }
225 m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true;
226 //erase from file list
227 nameFileNames.erase(nameFileNames.begin()+i);
234 if (inputDir != "") {
235 string path = m->hasPath(nameFileNames[i]);
236 //if the user has not given a path then, add inputdir. else leave path alone.
237 if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; }
243 ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror");
245 //if you can't open it, try default location
246 if (ableToOpen == 1) {
247 if (m->getDefaultPath() != "") { //default path is set
248 string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]);
249 m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
251 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
253 nameFileNames[i] = tryPath;
257 if (ableToOpen == 1) {
258 if (m->getOutputDir() != "") { //output directory is set
259 string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]);
260 m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
262 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
264 nameFileNames[i] = tryPath;
270 if (ableToOpen == 1) {
271 m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
272 //erase from file list
273 nameFileNames.erase(nameFileNames.begin()+i);
276 m->setNameFile(nameFileNames[i]);
281 //make sure there is at least one valid file left
282 if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
// Name files are paired with fasta files by position, so the counts must agree.
285 if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
// ---- group files (optional; same resolution steps) ---------------------------------------
287 bool hasGroup = true;
288 groupfile = validParameter.validFile(parameters, "group", false);
289 if (groupfile == "not found") { groupfile = ""; hasGroup = false; }
291 m->splitAtDash(groupfile, groupFileNames);
293 //go through files and make sure they are good, if not, then disregard them
294 for (int i = 0; i < groupFileNames.size(); i++) {
297 if (groupFileNames[i] == "current") {
298 groupFileNames[i] = m->getGroupFile();
299 if (groupFileNames[i] != "") { m->mothurOut("Using " + groupFileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); }
// Message corrected: this branch is about the group file, not the name file.
301 m->mothurOut("You have no current groupfile, ignoring current."); m->mothurOutEndLine(); ignore=true;
302 //erase from file list
303 groupFileNames.erase(groupFileNames.begin()+i);
310 if (inputDir != "") {
311 string path = m->hasPath(groupFileNames[i]);
312 //if the user has not given a path then, add inputdir. else leave path alone.
313 if (path == "") { groupFileNames[i] = inputDir + groupFileNames[i]; }
319 ableToOpen = m->openInputFile(groupFileNames[i], in, "noerror");
321 //if you can't open it, try default location
322 if (ableToOpen == 1) {
323 if (m->getDefaultPath() != "") { //default path is set
324 string tryPath = m->getDefaultPath() + m->getSimpleName(groupFileNames[i]);
325 m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
327 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
329 groupFileNames[i] = tryPath;
333 if (ableToOpen == 1) {
334 if (m->getOutputDir() != "") { //output directory is set
335 string tryPath = m->getOutputDir() + m->getSimpleName(groupFileNames[i]);
336 m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
338 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
340 groupFileNames[i] = tryPath;
346 if (ableToOpen == 1) {
347 m->mothurOut("Unable to open " + groupFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
348 //erase from file list
349 groupFileNames.erase(groupFileNames.begin()+i);
352 m->setGroupFile(groupFileNames[i]);
357 //make sure there is at least one valid file left
358 if (groupFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid group files."); m->mothurOutEndLine(); abort = true; }
361 if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
// ---- scalar options -----------------------------------------------------------------------
364 //if the user changes the output directory command factory will send this info to us in the output parameter
365 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
367 string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
368 m->setProcessors(temp);
369 m->mothurConvert(temp, processors);
371 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.50"; }
372 m->mothurConvert(temp, cutoff);
374 temp = validParameter.validFile(parameters, "alpha", false); if (temp == "not found"){ temp = "-5.54"; }
375 m->mothurConvert(temp, alpha);
// Fixed: this lookup used the "cutoff" key, so a user-supplied beta was ignored and a
// user-supplied cutoff silently became beta.
377 temp = validParameter.validFile(parameters, "beta", false); if (temp == "not found"){ temp = "0.33"; }
378 m->mothurConvert(temp, beta);
381 catch(exception& e) {
382 m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand");
386 //***************************************************************************************************************
// Runs the chimera check for every fasta file collected by the constructor.
// For each file: derives the ".perseus.chimeras" / ".perseus.accnos" output names, picks the
// matching name file (or generates one with unique.seqs) and optional group file, then either
// processes the sequences group by group (possibly on several processors) or as one set.
// Returns 0 when help was requested or the run was interrupted via control_pressed, and 2 when
// the command was aborted during construction.
// NOTE(review): this listing omits the function's short lines (else/closing braces, short
// declarations, the final return), so nesting must be confirmed against the full file.
// Fix in this revision: the output directory derived from a fasta file's path is now kept in a
// per-file local. Previously the member outputDir was overwritten on the first file, so every
// later file in a multi-file run was written into the first file's directory.
388 int ChimeraPerseusCommand::execute(){
390 if (abort == true) { if (calledHelp) { return 0; } return 2; }
394 for (int s = 0; s < fastaFileNames.size(); s++) {
396 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
398 int start = time(NULL);
399 string thisOutputDir = outputDir; if (thisOutputDir == "") { thisOutputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
400 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("chimera");
401 string accnosFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("accnos");
403 //string newFasta = m->getRootName(fastaFileNames[s]) + "temp";
405 //you provided a groupfile
406 string groupFile = "";
407 if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; }
409 string nameFile = "";
410 if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
411 nameFile = nameFileNames[s];
412 }else { nameFile = getNamesFile(fastaFileNames[s]); }
// On interruption, delete the output files registered so far and stop.
414 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
// ---- with a group file: check each group separately ----------------------------------------
419 if (groupFile != "") {
420 //Parse sequences by group
421 SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
422 vector<string> groups = parser.getNamesOfGroups();
424 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
// Create the two output files up front (opened and closed immediately); per-group results are appended later.
427 ofstream out, out1, out2;
428 m->openOutputFile(outputFileName, out); out.close();
429 m->openOutputFile(accnosFileName, out1); out1.close();
431 if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); }
432 else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); }
434 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
436 numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
438 m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
440 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
// ---- without a group file: single pass on one processor -----------------------------------
443 if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
445 //read sequences and store sorted by frequency
446 vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
448 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
450 numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);
453 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
// Report timing and register this file's outputs.
455 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine();
456 outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName);
457 outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName);
460 //set accnos file as new current accnosfile
462 itTypes = outputTypes.find("accnos");
463 if (itTypes != outputTypes.end()) {
464 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
467 m->mothurOutEndLine();
468 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
469 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
470 m->mothurOutEndLine();
475 catch(exception& e) {
476 m->errorOut(e, "ChimeraPerseusCommand", "execute");
480 //**********************************************************************************************************************
// Generates a name file for inputFile by running the unique.seqs command on it.
// inputFile is an in/out parameter: on success it is replaced by the fasta file unique.seqs
// produced, and nameFile is set to the name file it produced.
// If unique.seqs produced no name or fasta output, an error is reported, control_pressed is
// raised so the caller stops, and an empty string is returned (previously the first element of
// a possibly empty vector was read).
// NOTE(review): this listing omits the function's short lines (try framing, final return).
481 string ChimeraPerseusCommand::getNamesFile(string& inputFile){
483 string nameFile = "";
485 m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
487 //use unique.seqs to create new name and fastafile
488 string inputString = "fasta=" + inputFile;
489 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
490 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
// mothurCalling is set for the duration of the nested command and cleared afterwards.
491 m->mothurCalling = true;
493 Command* uniqueCommand = new DeconvoluteCommand(inputString);
494 uniqueCommand->execute();
// Copy the output-file map before the command object is destroyed.
496 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
498 delete uniqueCommand;
499 m->mothurCalling = false;
500 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
// Guard: do not index into an empty result when unique.seqs failed to produce its files.
if (filenames["name"].empty() || filenames["fasta"].empty()) { m->mothurOut("[ERROR]: unique.seqs did not create a name and fasta file for " + inputFile + "."); m->mothurOutEndLine(); m->control_pressed = true; return nameFile; }
502 nameFile = filenames["name"][0];
503 inputFile = filenames["fasta"][0];
507 catch(exception& e) {
508 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
512 //**********************************************************************************************************************
// Checks the groups with indices [start, end) of `groups` one after another.
// For each group: loads that group's sequences, runs driver() writing to per-group temporary
// files (outputFName + group name, accnos + group name), appends those to outputFName / accnos
// and deletes the temporaries. Sequence counts are accumulated in totalSeqs.
// Returns 0 as soon as control_pressed is seen.
// NOTE(review): declarations of totalSeqs / numChimeras and the final return are not visible in
// this listing; presumably totalSeqs is returned — confirm.
513 int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
519 for (int i = start; i < end; i++) {
521 m->mothurOutEndLine(); m->mothurOut("Checking sequences from group " + groups[i] + "..."); m->mothurOutEndLine();
// Note: this local `start` (a timestamp) shadows the `start` parameter for the rest of the loop body;
// the loop header has already read the parameter, so iteration is unaffected.
523 int start = time(NULL); if (m->control_pressed) { return 0; }
525 vector<seqData> sequences = loadSequences(parser, groups[i]);
527 if (m->control_pressed) { return 0; }
529 int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras);
530 totalSeqs += numSeqs;
532 if (m->control_pressed) { return 0; }
// Fold this group's temporary files into the combined outputs, then remove them.
535 m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i]));
536 m->appendFiles((accnos+groups[i]), accnos); m->mothurRemove((accnos+groups[i]));
538 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine();
544 catch(exception& e) {
545 m->errorOut(e, "ChimeraPerseusCommand", "driverGroups");
549 //**********************************************************************************************************************
// Builds the seqData list for one group from the parser's sequences and name map.
// Each entry holds the sequence name, its unaligned bases and the number of names listed for it
// in the name map. The member alignLength is raised to the longest unaligned sequence seen.
// A sequence missing from the name map is reported and, after the loop, raises control_pressed.
// The list is sorted in descending order (reverse iterators) before use.
// NOTE(review): this listing omits the function's short lines (declaration of `error`, any reset
// of alignLength, else/closing braces, final return); in particular confirm that `it->second`
// is only read when the name was found.
550 vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
553 vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
554 map<string, string> nameMap = parser.getNameMap(group);
555 map<string, string>::iterator it;
557 vector<seqData> sequences;
561 for (int i = 0; i < thisGroupsSeqs.size(); i++) {
563 if (m->control_pressed) { return sequences; }
565 it = nameMap.find(thisGroupsSeqs[i].getName());
566 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
// The abundance of a sequence is the count of names in its name-map entry.
568 int num = m->getNumNames(it->second);
569 sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
570 if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
574 if (error) { m->control_pressed = true; }
577 sort(sequences.rbegin(), sequences.rend());
581 catch(exception& e) {
582 m->errorOut(e, "ChimeraPerseusCommand", "loadSequences");
587 //**********************************************************************************************************************
// Reads a fasta file and its name file into a seqData list (no group file case).
// Each entry holds the sequence name, its unaligned bases and the count stored for that name by
// m->readNames(). The member alignLength is raised to the longest unaligned sequence seen.
// A sequence missing from the name file is reported and, after reading, raises control_pressed.
// The list is sorted in descending order (reverse iterators) before use.
// NOTE(review): this listing omits the function's short lines (stream/`error` declarations, the
// read loop header, else/closing braces, final return); in particular confirm that `it->second`
// is only read when the name was found.
// Fix in this revision: the exception handler reported the function as "getNamesFile".
588 vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
590 map<string, int>::iterator it;
591 map<string, int> nameMap = m->readNames(name);
593 //read fasta file and create sequenceData structure - checking for file mismatches
594 vector<seqData> sequences;
597 m->openInputFile(inputFile, in);
602 if (m->control_pressed) { in.close(); return sequences; }
604 Sequence temp(in); m->gobble(in);
606 it = nameMap.find(temp.getName());
607 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
609 sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
610 if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
615 if (error) { m->control_pressed = true; }
618 sort(sequences.rbegin(), sequences.rend());
622 catch(exception& e) {
// Report under this function's own name so error logs point at the right place.
623 m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
627 //**********************************************************************************************************************
// Core chimera check for one (already sorted) set of sequences.
// Writes one tab-separated row per sequence to chimeraFileName (after a header row) and the name
// of every sequence whose probability exceeds `cutoff` to accnosFileName.
// On control_pressed both output files are closed and removed and 0 is returned.
// NOTE(review): this listing omits the function's short lines (declarations of accnosFile,
// myPerseus and `type`, else/closing braces, updates of `chimeras` / numChimeras, final return);
// presumably numSeqs is returned and numChimeras is filled in — confirm against the full file.
628 int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& sequences, string accnosFileName, int& numChimeras){
// 4x4 substitution cost matrix; the lower triangle is set by hand and mirrored below.
// The trailing comments label rows/columns in the order A, C, T, G.
631 vector<vector<double> > correctModel(4); //could be an option in the future to input own model matrix
632 for(int i=0;i<4;i++){ correctModel[i].resize(4); }
634 correctModel[0][0] = 0.000000; //AA
635 correctModel[1][0] = 11.619259; //CA
636 correctModel[2][0] = 11.694004; //TA
637 correctModel[3][0] = 7.748623; //GA
639 correctModel[1][1] = 0.000000; //CC
640 correctModel[2][1] = 7.619657; //TC
641 correctModel[3][1] = 12.852562; //GC
643 correctModel[2][2] = 0.000000; //TT
644 correctModel[3][2] = 10.964048; //GT
646 correctModel[3][3] = 0.000000; //GG
// Mirror the lower triangle into the upper triangle so the matrix is symmetric.
648 for(int i=0;i<4;i++){
649 for(int j=0;j<i;j++){
650 correctModel[j][i] = correctModel[i][j];
654 int numSeqs = sequences.size();
655 //int alignLength = sequences[0].sequence.size();
657 ofstream chimeraFile;
659 m->openOutputFile(chimeraFileName, chimeraFile);
660 m->openOutputFile(accnosFileName, accnosFile);
// Binomial table sized by the member alignLength (set while the sequences were loaded).
663 vector<vector<double> > binMatrix = myPerseus.binomial(alignLength);
// Header row of the chimera report; column names are part of the output format.
665 chimeraFile << "SequenceIndex\tName\tDiffsToBestMatch\tBestMatchIndex\tBestMatchName\tDiffstToChimera\tIndexofLeftParent\tIndexOfRightParent\tNameOfLeftParent\tNameOfRightParent\tDistanceToBestMatch\tcIndex\t(cIndex - singleDist)\tloonIndex\tMismatchesToChimera\tMismatchToTrimera\tChimeraBreakPoint\tLogisticProbability\tTypeOfSequence\n";
// One flag per sequence, all initially false; copied into `restricted` for each query below.
667 vector<bool> chimeras(numSeqs, 0);
669 for(int i=0;i<numSeqs;i++){
670 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
672 vector<bool> restricted = chimeras;
// Per-query scratch structures filled by getAlignments().
674 vector<vector<int> > leftDiffs(numSeqs);
675 vector<vector<int> > leftMaps(numSeqs);
676 vector<vector<int> > rightDiffs(numSeqs);
677 vector<vector<int> > rightMaps(numSeqs);
679 vector<int> singleLeft, bestLeft;
680 vector<int> singleRight, bestRight;
682 int bestSingleIndex, bestSingleDiff;
683 vector<pwAlign> alignments(numSeqs);
685 int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
686 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
688 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
690 string dummyA, dummyB;
// Sequences shorter than 3 characters are written as "good" with placeholder columns.
692 if (sequences[i].sequence.size() < 3) {
693 chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
// With at least two comparisons a two-parent chimera (bimera) can be evaluated.
694 }else if(comparisons >= 2){
695 minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
696 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
// A three-parent chimera (trimera) is only tried when the bimera still has >= 3 mismatches
// and at least three comparisons are available; otherwise its mismatch count stays at INT max.
698 int minMismatchToTrimera = numeric_limits<int>::max();
699 int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
701 if(minMismatchToChimera >= 3 && comparisons >= 3){
702 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
703 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
// Modelled distance from the query to its single best match.
706 double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
708 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
// Build the reference the query is compared against: the trimera when it saves at least
// 3 mismatches over the bimera, otherwise the bimera.
711 string chimeraRefSeq;
713 if(minMismatchToChimera - minMismatchToTrimera >= 3){
715 chimeraRefSeq = myPerseus.stitchTrimera(alignments, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, leftMaps, rightMaps);
719 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
722 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
// Modelled distance from the query to the stitched reference.
724 double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
726 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
728 double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
729 double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
731 if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
// Write the report row in the column order of the header above.
733 chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
734 chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
735 chimeraFile << singleDist << '\t' << cIndex << '\t' << (cIndex - singleDist) << '\t' << loonIndex << '\t';
736 chimeraFile << minMismatchToChimera << '\t' << minMismatchToTrimera << '\t' << breakPointBi << '\t';
// Probability from the classifier, using the user-configurable alpha and beta.
738 double probability = myPerseus.classifyChimera(singleDist, cIndex, loonIndex, alpha, beta);
740 chimeraFile << probability << '\t';
// Above the cutoff: report the sequence type and list the name in the accnos file.
742 if(probability > cutoff){
743 chimeraFile << type << endl;
744 accnosFile << sequences[i].seqName << endl;
749 chimeraFile << "good" << endl;
// Remaining case (neither of the branches above applied): written as "good" with placeholder columns.
754 chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
// Progress report every 100 sequences, plus a final one when the total is not a multiple of 100.
758 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1) + "\n"); }
761 if((numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(numSeqs) + "\n"); }
768 catch(exception& e) {
769 m->errorOut(e, "ChimeraPerseusCommand", "driver");
773 /**************************************************************************************************/
774 int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
777 vector<int> processIDS;
782 if (groups.size() < processors) { processors = groups.size(); }
784 //divide the groups between the processors
785 vector<linePair> lines;
786 int numGroupsPerProcessor = groups.size() / processors;
787 for (int i = 0; i < processors; i++) {
788 int startIndex = i * numGroupsPerProcessor;
789 int endIndex = (i+1) * numGroupsPerProcessor;
790 if(i == (processors - 1)){ endIndex = groups.size(); }
791 lines.push_back(linePair(startIndex, endIndex));
794 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
796 //loop through and create all the processes you want
797 while (process != processors) {
801 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
804 num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
806 //pass numSeqs to parent
808 string tempFile = outputFName + toString(getpid()) + ".num.temp";
809 m->openOutputFile(tempFile, out);
815 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
816 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
822 num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
824 //force parent to wait until all the processes are done
825 for (int i=0;i<processIDS.size();i++) {
826 int temp = processIDS[i];
830 for (int i = 0; i < processIDS.size(); i++) {
832 string tempFile = outputFName + toString(processIDS[i]) + ".num.temp";
833 m->openInputFile(tempFile, in);
834 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
835 in.close(); m->mothurRemove(tempFile);
839 //////////////////////////////////////////////////////////////////////////////////////////////////////
840 //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
841 //Above fork() will clone, so memory is separate, but that's not the case with windows,
842 //////////////////////////////////////////////////////////////////////////////////////////////////////
844 vector<perseusData*> pDataArray;
845 DWORD dwThreadIdArray[processors-1];
846 HANDLE hThreadArray[processors-1];
848 //Create processor worker threads.
849 for( int i=1; i<processors; i++ ){
850 // Allocate memory for thread data.
851 string extension = toString(i) + ".temp";
853 perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
855 pDataArray.push_back(tempPerseus);
856 processIDS.push_back(i);
858 //MyPerseusThreadFunction is in header. It must be global or static to work with the threads.
859 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
860 hThreadArray[i-1] = CreateThread(NULL, 0, MyPerseusThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
864 //using the main process as a worker saves time and memory
865 num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
867 //Wait until all threads have terminated.
868 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
870 //Close all thread handles and free memory allocations.
871 for(int i=0; i < pDataArray.size(); i++){
872 num += pDataArray[i]->count;
873 CloseHandle(hThreadArray[i]);
874 delete pDataArray[i];
879 //append output files
880 for(int i=0;i<processIDS.size();i++){
881 m->appendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName);
882 m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp"));
884 m->appendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos);
885 m->mothurRemove((accnos + toString(processIDS[i]) + ".temp"));
891 catch(exception& e) {
892 m->errorOut(e, "ChimeraPerseusCommand", "createProcessesGroups");
896 //**********************************************************************************************************************
897 int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
899 map<string, string> uniqueNames = parser.getAllSeqsMap();
900 map<string, string>::iterator itUnique;
905 m->openInputFile(accnosFileName, in2);
908 m->openOutputFile(accnosFileName+".temp", out2);
911 set<string> namesInFile; //this is so if a sequence is found to be chimera in several samples we dont write it to the results file more than once
912 set<string>::iterator itNames;
913 set<string> chimerasInFile;
914 set<string>::iterator itChimeras;
918 if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; }
920 in2 >> name; m->gobble(in2);
923 itUnique = uniqueNames.find(name);
925 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
927 itChimeras = chimerasInFile.find((itUnique->second));
929 if (itChimeras == chimerasInFile.end()) {
930 out2 << itUnique->second << endl;
931 chimerasInFile.insert((itUnique->second));
939 m->mothurRemove(accnosFileName);
940 rename((accnosFileName+".temp").c_str(), accnosFileName.c_str());
944 m->openInputFile(outputFileName, in);
947 m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
949 int DiffsToBestMatch, BestMatchIndex, DiffstToChimera, IndexofLeftParent, IndexOfRightParent;
950 float temp1,temp2, temp3, temp4, temp5, temp6, temp7, temp8;
951 string index, BestMatchName, parent1, parent2, flag;
954 //assumptions - in file each read will always look like
956 SequenceIndex Name DiffsToBestMatch BestMatchIndex BestMatchName DiffstToChimera IndexofLeftParent IndexOfRightParent NameOfLeftParent NameOfRightParent DistanceToBestMatch cIndex (cIndex - singleDist) loonIndex MismatchesToChimera MismatchToTrimera ChimeraBreakPoint LogisticProbability TypeOfSequence
957 0 F01QG4L02JVBQY 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
958 1 F01QG4L02ICTC6 0 0 Null 0 0 0 Null Null 0.0 0.0 0.0 0.0 0 0 0 0.0 0.0 good
959 2 F01QG4L02JZOEC 48 0 F01QG4L02JVBQY 47 0 0 F01QG4L02JVBQY F01QG4L02JVBQY 2.0449 2.03545 -0.00944493 0 47 2147483647 138 0 good
960 3 F01QG4L02G7JEC 42 0 F01QG4L02JVBQY 40 1 0 F01QG4L02ICTC6 F01QG4L02JVBQY 1.87477 1.81113 -0.0636404 5.80145 40 2147483647 25 0 good
963 //get and print headers
964 BestMatchName = m->getline(in); m->gobble(in);
965 out << BestMatchName << endl;
969 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove((outputFileName+".temp")); return 0; }
972 in >> index; m->gobble(in);
974 if (index != "SequenceIndex") { //if you are not a header line, there will be a header line for each group if group file is given
975 in >> name; m->gobble(in);
976 in >> DiffsToBestMatch; m->gobble(in);
977 in >> BestMatchIndex; m->gobble(in);
978 in >> BestMatchName; m->gobble(in);
979 in >> DiffstToChimera; m->gobble(in);
980 in >> IndexofLeftParent; m->gobble(in);
981 in >> IndexOfRightParent; m->gobble(in);
982 in >> parent1; m->gobble(in);
983 in >> parent2; m->gobble(in);
984 in >> temp1 >> temp2 >> temp3 >> temp4 >> temp5 >> temp6 >> temp7 >> temp8 >> flag; m->gobble(in);
987 itUnique = uniqueNames.find(name);
989 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
991 name = itUnique->second;
992 //is this name already in the file
993 itNames = namesInFile.find((name));
995 if (itNames == namesInFile.end()) { //no not in file
996 if (flag == "good") { //are you really a no??
997 //is this sequence really not chimeric??
998 itChimeras = chimerasInFile.find(name);
1000 //then you really are a no so print, otherwise skip
1001 if (itChimeras == chimerasInFile.end()) { print = true; }
1002 }else{ print = true; }
1007 out << index << '\t' << name << '\t' << DiffsToBestMatch << '\t' << BestMatchIndex << '\t';
1008 namesInFile.insert(name);
1010 if (BestMatchName != "Null") {
1011 itUnique = uniqueNames.find(BestMatchName);
1012 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find BestMatchName "+ BestMatchName + "."); m->mothurOutEndLine(); m->control_pressed = true; }
1013 else { out << itUnique->second << '\t'; }
1014 }else { out << "Null" << '\t'; }
1016 out << DiffstToChimera << '\t' << IndexofLeftParent << '\t' << IndexOfRightParent << '\t';
1018 if (parent1 != "Null") {
1019 itUnique = uniqueNames.find(parent1);
1020 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent1 "+ parent1 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
1021 else { out << itUnique->second << '\t'; }
1022 }else { out << "Null" << '\t'; }
1024 if (parent1 != "Null") {
1025 itUnique = uniqueNames.find(parent2);
1026 if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing chimera results. Cannot find parent2 "+ parent2 + "."); m->mothurOutEndLine(); m->control_pressed = true; }
1027 else { out << itUnique->second << '\t'; }
1028 }else { out << "Null" << '\t'; }
1030 out << temp1 << '\t' << temp2 << '\t' << temp3 << '\t' << temp4 << '\t' << temp5 << '\t' << temp6 << '\t' << temp7 << '\t' << temp8 << '\t' << flag << endl;
1032 }else { index = m->getline(in); m->gobble(in); }
1037 m->mothurRemove(outputFileName);
1038 rename((outputFileName+".temp").c_str(), outputFileName.c_str());
1042 catch(exception& e) {
1043 m->errorOut(e, "ChimeraPerseusCommand", "deconvoluteResults");
1047 //**********************************************************************************************************************