5 * Created by Pat Schloss on 7/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13 #include "refchimeratest.h"
14 #include "filterseqscommand.h"
17 //**********************************************************************************************************************
18 vector<string> SeqErrorCommand::setParameters(){
20 CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery);
21 CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference);
22 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile);
23 CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport);
24 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
25 CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras);
26 CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28 CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "SeqErrorCommand", "setParameters");
41 //**********************************************************************************************************************
42 string SeqErrorCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n";
46 helpString += "The fasta parameter...\n";
47 helpString += "The reference parameter...\n";
48 helpString += "The qfile parameter...\n";
49 helpString += "The report parameter...\n";
50 helpString += "The name parameter...\n";
51 helpString += "The ignorechimeras parameter...\n";
52 helpString += "The threshold parameter...\n";
53 helpString += "The processors parameter...\n";
54 helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f.";
55 helpString += "Example seq.error(...).\n";
56 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
57 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n";
61 m->errorOut(e, "SeqErrorCommand", "getHelpString");
65 //**********************************************************************************************************************
66 string SeqErrorCommand::getOutputFileNameTag(string type, string inputName=""){
68 string outputFileName = "";
69 map<string, vector<string> >::iterator it;
71 //is this a type this command creates
72 it = outputTypes.find(type);
73 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
75 if (type == "errorsummary") { outputFileName = "error.summary"; }
76 else if (type == "errorseq") { outputFileName = "error.seq"; }
77 else if (type == "errorquality") { outputFileName = "error.quality"; }
78 else if (type == "errorqualforward") { outputFileName = "error.qual.forward"; }
79 else if (type == "errorqualreverse") { outputFileName = "error.qual.reverse"; }
80 else if (type == "errorforward") { outputFileName = "error.seq.forward"; }
81 else if (type == "errorreverse") { outputFileName = "error.seq.reverse"; }
82 else if (type == "errorcount") { outputFileName = "error.count"; }
83 else if (type == "errormatrix") { outputFileName = "error.matrix"; }
84 else if (type == "errorchimera") { outputFileName = "error.chimera"; }
85 else if (type == "errorref-query") { outputFileName = "error.ref-query"; }
86 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
88 return outputFileName;
91 m->errorOut(e, "SeqErrorCommand", "getOutputFileNameTag");
95 //**********************************************************************************************************************
96 SeqErrorCommand::SeqErrorCommand(){
98 abort = true; calledHelp = true;
100 vector<string> tempOutNames;
101 outputTypes["errorsummary"] = tempOutNames;
102 outputTypes["errorseq"] = tempOutNames;
103 outputTypes["errorquality"] = tempOutNames;
104 outputTypes["errorqualforward"] = tempOutNames;
105 outputTypes["errorqualreverse"] = tempOutNames;
106 outputTypes["errorforward"] = tempOutNames;
107 outputTypes["errorreverse"] = tempOutNames;
108 outputTypes["errorcount"] = tempOutNames;
109 outputTypes["errormatrix"] = tempOutNames;
110 outputTypes["errorchimera"] = tempOutNames;
111 outputTypes["errorref-query"] = tempOutNames;
113 catch(exception& e) {
114 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
118 //***************************************************************************************************************
120 SeqErrorCommand::SeqErrorCommand(string option) {
123 abort = false; calledHelp = false;
124 rdb = ReferenceDB::getInstance();
126 //allow user to run help
127 if(option == "help") { help(); abort = true; calledHelp = true; }
128 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132 vector<string> myArray = setParameters();
134 OptionParser parser(option);
135 map<string,string> parameters = parser.getParameters();
137 ValidParameters validParameter;
138 map<string,string>::iterator it;
140 //check to make sure all parameters are valid for command
141 for (it = parameters.begin(); it != parameters.end(); it++) {
142 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
145 //initialize outputTypes
146 vector<string> tempOutNames;
147 outputTypes["errorsummary"] = tempOutNames;
148 outputTypes["errorseq"] = tempOutNames;
149 outputTypes["errorquality"] = tempOutNames;
150 outputTypes["errorqualforward"] = tempOutNames;
151 outputTypes["errorqualreverse"] = tempOutNames;
152 outputTypes["errorforward"] = tempOutNames;
153 outputTypes["errorreverse"] = tempOutNames;
154 outputTypes["errorcount"] = tempOutNames;
155 outputTypes["errormatrix"] = tempOutNames;
156 outputTypes["errorchimera"] = tempOutNames;
157 outputTypes["errorref-query"] = tempOutNames;
160 //if the user changes the input directory command factory will send this info to us in the output parameter
161 string inputDir = validParameter.validFile(parameters, "inputdir", false);
162 if (inputDir == "not found"){ inputDir = ""; }
165 it = parameters.find("fasta");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["fasta"] = inputDir + it->second; }
173 it = parameters.find("reference");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["reference"] = inputDir + it->second; }
181 it = parameters.find("name");
182 //user has given a names file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["name"] = inputDir + it->second; }
189 it = parameters.find("qfile");
190 //user has given a quality score file
191 if(it != parameters.end()){
192 path = m->hasPath(it->second);
193 //if the user has not given a path then, add inputdir. else leave path alone.
194 if (path == "") { parameters["qfile"] = inputDir + it->second; }
197 it = parameters.find("report");
198 //user has given a alignment report file
199 if(it != parameters.end()){
200 path = m->hasPath(it->second);
201 //if the user has not given a path then, add inputdir. else leave path alone.
202 if (path == "") { parameters["report"] = inputDir + it->second; }
206 //check for required parameters
207 queryFileName = validParameter.validFile(parameters, "fasta", true);
208 if (queryFileName == "not found") {
209 queryFileName = m->getFastaFile();
210 if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
211 else { m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
213 else if (queryFileName == "not open") { queryFileName = ""; abort = true; }
214 else { m->setFastaFile(queryFileName); }
216 referenceFileName = validParameter.validFile(parameters, "reference", true);
217 if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
218 else if (referenceFileName == "not open") { abort = true; }
220 //check for optional parameters
221 namesFileName = validParameter.validFile(parameters, "name", true);
222 if(namesFileName == "not found"){ namesFileName = ""; }
223 else if (namesFileName == "not open") { namesFileName = ""; abort = true; }
224 else { m->setNameFile(namesFileName); }
226 qualFileName = validParameter.validFile(parameters, "qfile", true);
227 if(qualFileName == "not found"){ qualFileName = ""; }
228 else if (qualFileName == "not open") { qualFileName = ""; abort = true; }
229 else { m->setQualFile(qualFileName); }
231 reportFileName = validParameter.validFile(parameters, "report", true);
232 if(reportFileName == "not found"){ reportFileName = ""; }
233 else if (reportFileName == "not open") { reportFileName = ""; abort = true; }
235 if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
236 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
237 m->mothurOutEndLine();
241 outputDir = validParameter.validFile(parameters, "outputdir", false);
242 if (outputDir == "not found"){
244 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it
247 //check for optional parameter and set defaults
248 // ...at some point should added some additional type checking...
249 temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; }
250 m->mothurConvert(temp, threshold);
252 temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; }
253 save = m->isTrue(temp);
255 if (save) { //clear out old references
259 //this has to go after save so that if the user sets save=t and provides no reference we abort
260 referenceFileName = validParameter.validFile(parameters, "reference", true);
261 if (referenceFileName == "not found") {
262 //check for saved reference sequences
263 if (rdb->referenceSeqs.size() != 0) {
264 referenceFileName = "saved";
266 m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required.");
267 m->mothurOutEndLine();
270 }else if (referenceFileName == "not open") { abort = true; }
271 else { if (save) { rdb->setSavedReference(referenceFileName); } }
274 temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; }
275 ignoreChimeras = m->isTrue(temp);
277 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
278 m->setProcessors(temp);
279 m->mothurConvert(temp, processors);
281 substitutionMatrix.resize(6);
282 for(int i=0;i<6;i++){ substitutionMatrix[i].resize(6,0); }
284 if ((namesFileName == "") && (queryFileName != "")){
285 vector<string> files; files.push_back(queryFileName);
286 parser.getNameFile(files);
290 catch(exception& e) {
291 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
295 //***************************************************************************************************************
297 int SeqErrorCommand::execute(){
299 if (abort == true) { if (calledHelp) { return 0; } return 2; }
301 int start = time(NULL);
306 string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName));
307 string errorSummaryFileName = fileNameRoot + getOutputFileNameTag("errorsummary");
308 outputNames.push_back(errorSummaryFileName); outputTypes["errorsummary"].push_back(errorSummaryFileName);
310 string errorSeqFileName = fileNameRoot + getOutputFileNameTag("errorseq");
311 outputNames.push_back(errorSeqFileName); outputTypes["errorseq"].push_back(errorSeqFileName);
313 string errorChimeraFileName = fileNameRoot + getOutputFileNameTag("errorchimera");
314 outputNames.push_back(errorChimeraFileName); outputTypes["errorchimera"].push_back(errorChimeraFileName);
316 getReferences(); //read in reference sequences - make sure there's no ambiguous bases
318 if(namesFileName != ""){ weights = getWeights(); }
320 vector<unsigned long long> fastaFilePos;
321 vector<unsigned long long> qFilePos;
322 vector<unsigned long long> reportFilePos;
324 setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos);
326 if (m->control_pressed) { return 0; }
328 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
329 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
330 if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); }
331 if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); }
333 if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds
336 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
338 numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]);
340 numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName);
343 numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]);
346 if(qualFileName != "" && reportFileName != ""){
347 printErrorQuality(qScoreErrorMap);
348 printQualityFR(qualForwardMap, qualReverseMap);
351 printErrorFRFile(errorForward, errorReverse);
353 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
355 string errorCountFileName = fileNameRoot + getOutputFileNameTag("errorcount");
356 ofstream errorCountFile;
357 m->openOutputFile(errorCountFileName, errorCountFile);
358 outputNames.push_back(errorCountFileName); outputTypes["errorcount"].push_back(errorCountFileName);
359 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n");
360 m->mothurOut("Errors\tSequences\n");
361 errorCountFile << "Errors\tSequences\n";
362 for(int i=0;i<misMatchCounts.size();i++){
363 m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
364 errorCountFile << i << '\t' << misMatchCounts[i] << endl;
366 errorCountFile.close();
368 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 string megAlignmentFileName = fileNameRoot + getOutputFileNameTag("errorref-query");
373 ofstream megAlignmentFile;
374 m->openOutputFile(megAlignmentFileName, megAlignmentFile);
375 outputNames.push_back(megAlignmentFileName); outputTypes["errorref-query"].push_back(megAlignmentFileName);
377 for(int i=0;i<numRefs;i++){
378 megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
379 megAlignmentFile << megaAlignVector[i] << endl;
382 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences.");
383 m->mothurOutEndLine();
385 m->mothurOutEndLine();
386 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
387 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
388 m->mothurOutEndLine();
392 catch(exception& e) {
393 m->errorOut(e, "SeqErrorCommand", "execute");
397 //**********************************************************************************************************************
398 int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) {
402 map<char, vector<int> >::iterator it;
404 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
406 //loop through and create all the processes you want
407 while (process != processors) {
411 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
415 num = driver(filename, qFileName, rFileName, summaryFileName + toString(getpid()) + ".temp", errorOutputFileName+ toString(getpid()) + ".temp", chimeraOutputFileName + toString(getpid()) + ".temp", lines[process], qLines[process], rLines[process]);
417 //pass groupCounts to parent
419 string tempFile = filename + toString(getpid()) + ".info.temp";
420 m->openOutputFile(tempFile, out);
422 //output totalBases and totalMatches
423 out << num << '\t' << totalBases << '\t' << totalMatches << endl << endl;
425 //output substitutionMatrix
426 for(int i = 0; i < substitutionMatrix.size(); i++) {
427 for (int j = 0; j < substitutionMatrix[i].size(); j++) {
428 out << substitutionMatrix[i][j] << '\t';
434 //output qScoreErrorMap
435 for (it = qScoreErrorMap.begin(); it != qScoreErrorMap.end(); it++) {
436 vector<int> thisScoreErrorMap = it->second;
437 out << it->first << '\t';
438 for (int i = 0; i < thisScoreErrorMap.size(); i++) {
439 out << thisScoreErrorMap[i] << '\t';
445 //output qualForwardMap
446 for(int i = 0; i < qualForwardMap.size(); i++) {
447 for (int j = 0; j < qualForwardMap[i].size(); j++) {
448 out << qualForwardMap[i][j] << '\t';
454 //output qualReverseMap
455 for(int i = 0; i < qualReverseMap.size(); i++) {
456 for (int j = 0; j < qualReverseMap[i].size(); j++) {
457 out << qualReverseMap[i][j] << '\t';
464 //output errorForward
465 for (it = errorForward.begin(); it != errorForward.end(); it++) {
466 vector<int> thisErrorForward = it->second;
467 out << it->first << '\t';
468 for (int i = 0; i < thisErrorForward.size(); i++) {
469 out << thisErrorForward[i] << '\t';
475 //output errorReverse
476 for (it = errorReverse.begin(); it != errorReverse.end(); it++) {
477 vector<int> thisErrorReverse = it->second;
478 out << it->first << '\t';
479 for (int i = 0; i < thisErrorReverse.size(); i++) {
480 out << thisErrorReverse[i] << '\t';
486 //output misMatchCounts
487 out << misMatchCounts.size() << endl;
488 for (int j = 0; j < misMatchCounts.size(); j++) {
489 out << misMatchCounts[j] << '\t';
494 //output megaAlignVector
495 for (int j = 0; j < megaAlignVector.size(); j++) {
496 out << megaAlignVector[j] << endl;
504 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
505 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
511 num = driver(filename, qFileName, rFileName, summaryFileName, errorOutputFileName, chimeraOutputFileName, lines[0], qLines[0], rLines[0]);
513 //force parent to wait until all the processes are done
514 for (int i=0;i<processIDS.size();i++) {
515 int temp = processIDS[i];
520 for(int i=0;i<processIDS.size();i++){
522 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
524 m->appendFiles((summaryFileName + toString(processIDS[i]) + ".temp"), summaryFileName);
525 m->mothurRemove((summaryFileName + toString(processIDS[i]) + ".temp"));
526 m->appendFiles((errorOutputFileName + toString(processIDS[i]) + ".temp"), errorOutputFileName);
527 m->mothurRemove((errorOutputFileName + toString(processIDS[i]) + ".temp"));
528 m->appendFiles((chimeraOutputFileName + toString(processIDS[i]) + ".temp"), chimeraOutputFileName);
529 m->mothurRemove((chimeraOutputFileName + toString(processIDS[i]) + ".temp"));
532 string tempFile = filename + toString(processIDS[i]) + ".info.temp";
533 m->openInputFile(tempFile, in);
535 //input totalBases and totalMatches
536 int tempBases, tempMatches, tempNumSeqs;
537 in >> tempNumSeqs >> tempBases >> tempMatches; m->gobble(in);
538 totalBases += tempBases; totalMatches += tempMatches; num += tempNumSeqs;
540 //input substitutionMatrix
542 for(int i = 0; i < substitutionMatrix.size(); i++) {
543 for (int j = 0; j < substitutionMatrix[i].size(); j++) {
544 in >> tempNum; substitutionMatrix[i][j] += tempNum;
550 //input qScoreErrorMap
552 for (int i = 0; i < qScoreErrorMap.size(); i++) {
554 vector<int> thisScoreErrorMap = qScoreErrorMap[first];
556 for (int i = 0; i < thisScoreErrorMap.size(); i++) {
557 in >> tempNum; thisScoreErrorMap[i] += tempNum;
559 qScoreErrorMap[first] = thisScoreErrorMap;
564 //input qualForwardMap
565 for(int i = 0; i < qualForwardMap.size(); i++) {
566 for (int j = 0; j < qualForwardMap[i].size(); j++) {
567 in >> tempNum; qualForwardMap[i][j] += tempNum;
573 //input qualReverseMap
574 for(int i = 0; i < qualReverseMap.size(); i++) {
575 for (int j = 0; j < qualReverseMap[i].size(); j++) {
576 in >> tempNum; qualReverseMap[i][j] += tempNum;
583 for (int i = 0; i < errorForward.size(); i++) {
585 vector<int> thisErrorForward = errorForward[first];
587 for (int i = 0; i < thisErrorForward.size(); i++) {
588 in >> tempNum; thisErrorForward[i] += tempNum;
590 errorForward[first] = thisErrorForward;
596 for (int i = 0; i < errorReverse.size(); i++) {
598 vector<int> thisErrorReverse = errorReverse[first];
600 for (int i = 0; i < thisErrorReverse.size(); i++) {
601 in >> tempNum; thisErrorReverse[i] += tempNum;
603 errorReverse[first] = thisErrorReverse;
608 //input misMatchCounts
610 in >> misMatchSize; m->gobble(in);
611 if (misMatchSize > misMatchCounts.size()) { misMatchCounts.resize(misMatchSize, 0); }
612 for (int j = 0; j < misMatchCounts.size(); j++) {
613 in >> tempNum; misMatchCounts[j] += tempNum;
617 //input megaAlignVector
619 for (int j = 0; j < megaAlignVector.size(); j++) {
620 thisLine = m->getline(in); m->gobble(in); megaAlignVector[j] += thisLine + '\n';
624 in.close(); m->mothurRemove(tempFile);
630 catch(exception& e) {
631 m->errorOut(e, "SeqErrorCommand", "createProcesses");
635 //**********************************************************************************************************************
636 int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) {
640 QualityScores quality;
642 misMatchCounts.resize(11, 0);
646 map<string, int>::iterator it;
647 qScoreErrorMap['m'].assign(41, 0);
648 qScoreErrorMap['s'].assign(41, 0);
649 qScoreErrorMap['i'].assign(41, 0);
650 qScoreErrorMap['a'].assign(41, 0);
652 errorForward['m'].assign(maxLength,0);
653 errorForward['s'].assign(maxLength,0);
654 errorForward['i'].assign(maxLength,0);
655 errorForward['d'].assign(maxLength,0);
656 errorForward['a'].assign(maxLength,0);
658 errorReverse['m'].assign(maxLength,0);
659 errorReverse['s'].assign(maxLength,0);
660 errorReverse['i'].assign(maxLength,0);
661 errorReverse['d'].assign(maxLength,0);
662 errorReverse['a'].assign(maxLength,0);
664 //open inputfiles and go to beginning place for this processor
666 m->openInputFile(filename, queryFile);
667 queryFile.seekg(line.start);
671 if(qFileName != "" && rFileName != ""){
672 m->openInputFile(qFileName, qualFile);
673 qualFile.seekg(qline.start);
676 if (rline.start == 0) { report = ReportFile(reportFile, rFileName); }
678 m->openInputFile(rFileName, reportFile);
679 reportFile.seekg(rline.start);
682 qualForwardMap.resize(maxLength);
683 qualReverseMap.resize(maxLength);
684 for(int i=0;i<maxLength;i++){
685 qualForwardMap[i].assign(41,0);
686 qualReverseMap[i].assign(41,0);
690 ofstream outChimeraReport;
691 m->openOutputFile(chimeraOutputFileName, outChimeraReport);
692 RefChimeraTest chimeraTest(referenceSeqs);
693 if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); }
695 ofstream errorSummaryFile;
696 m->openOutputFile(summaryFileName, errorSummaryFile);
697 if (line.start == 0) { printErrorHeader(errorSummaryFile); }
699 ofstream errorSeqFile;
700 m->openOutputFile(errorOutputFileName, errorSeqFile);
702 megaAlignVector.resize(numRefs, "");
710 if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; }
712 Sequence query(queryFile);
714 int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport);
715 int closestRefIndex = chimeraTest.getClosestRefIndex();
717 if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; }
718 else { ignoreSeq = 0; }
721 getErrors(query, referenceSeqs[closestRefIndex], minCompare);
723 if(namesFileName != ""){
724 it = weights.find(query.getName());
725 minCompare.weight = it->second;
727 else{ minCompare.weight = 1; }
729 printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile);
733 for(int i=0;i<minCompare.sequence.length();i++){
734 char letter = minCompare.sequence[i];
737 errorForward[letter][i] += minCompare.weight;
738 errorReverse[letter][minCompare.total-i-1] += minCompare.weight;
743 if(qualFileName != "" && reportFileName != ""){
744 report = ReportFile(reportFile);
746 // int origLength = report.getQueryLength();
747 int startBase = report.getQueryStart();
748 int endBase = report.getQueryEnd();
750 quality = QualityScores(qualFile);
753 // cout << query.getName() << '\t';
755 quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
756 quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
757 quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
763 if(minCompare.errorRate < threshold && !ignoreSeq){
764 totalBases += (minCompare.total * minCompare.weight);
765 totalMatches += minCompare.matches * minCompare.weight;
766 if(minCompare.mismatches > maxMismatch){
767 maxMismatch = minCompare.mismatches;
768 misMatchCounts.resize(maxMismatch + 1, 0);
770 misMatchCounts[minCompare.mismatches] += minCompare.weight;
773 megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
778 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
779 unsigned long long pos = queryFile.tellg();
780 if ((pos == -1) || (pos >= line.end)) { break; }
782 if (queryFile.eof()) { break; }
785 if(index % 100 == 0){ m->mothurOut(toString(index) + '\n'); }
788 if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); }
789 errorSummaryFile.close();
790 errorSeqFile.close();
793 if(index % 100 != 0){ m->mothurOut(toString(index) + '\n'); }
797 catch(exception& e) {
798 m->errorOut(e, "SeqErrorCommand", "driver");
802 //***************************************************************************************************************
804 void SeqErrorCommand::getReferences(){
806 int numAmbigSeqs = 0;
809 int minEndPos = 100000;
811 if (referenceFileName == "saved") {
812 int start = time(NULL);
813 m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine();
815 for (int i = 0; i < rdb->referenceSeqs.size(); i++) {
816 int numAmbigs = rdb->referenceSeqs[i].getAmbigBases();
817 if(numAmbigs > 0){ numAmbigSeqs++; }
819 // int startPos = rdb->referenceSeqs[i].getStartPos();
820 // if(startPos > maxStartPos) { maxStartPos = startPos; }
822 // int endPos = rdb->referenceSeqs[i].getEndPos();
823 // if(endPos < minEndPos) { minEndPos = endPos; }
825 referenceSeqs.push_back(rdb->referenceSeqs[i]);
827 referenceFileName = rdb->getSavedReference();
829 m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine();
832 int start = time(NULL);
834 ifstream referenceFile;
835 m->openInputFile(referenceFileName, referenceFile);
837 while(referenceFile){
838 Sequence currentSeq(referenceFile);
839 int numAmbigs = currentSeq.getAmbigBases();
840 if(numAmbigs > 0){ numAmbigSeqs++; }
842 // int startPos = currentSeq.getStartPos();
843 // if(startPos > maxStartPos) { maxStartPos = startPos; }
845 // int endPos = currentSeq.getEndPos();
846 // if(endPos < minEndPos) { minEndPos = endPos; }
847 referenceSeqs.push_back(currentSeq);
849 if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); }
851 m->gobble(referenceFile);
853 referenceFile.close();
855 m->mothurOut("It took " + toString(time(NULL) - start) + " to read " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine();
858 numRefs = referenceSeqs.size();
860 for(int i=0;i<numRefs;i++){
861 referenceSeqs[i].padToPos(maxStartPos);
862 referenceSeqs[i].padFromPos(minEndPos);
865 if(numAmbigSeqs != 0){
866 m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
870 catch(exception& e) {
871 m->errorOut(e, "SeqErrorCommand", "getReferences");
876 //***************************************************************************************************************
878 int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& errors){
880 if(query.getAlignLength() != reference.getAlignLength()){
881 m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
883 int alignLength = query.getAlignLength();
885 string q = query.getAligned();
886 string r = reference.getAligned();
891 for(int i=0;i<alignLength;i++){
892 // cout << r[i] << '\t' << q[i] << '\t';
893 if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){ // no missing data and no double gaps
898 if(r[i] == 'A'){ errors.AA++; errors.matches++; errors.sequence += 'm'; }
899 if(r[i] == 'T'){ errors.AT++; errors.sequence += 's'; }
900 if(r[i] == 'G'){ errors.AG++; errors.sequence += 's'; }
901 if(r[i] == 'C'){ errors.AC++; errors.sequence += 's'; }
902 if(r[i] == '-'){ errors.Ai++; errors.sequence += 'i'; }
904 else if(q[i] == 'T'){
905 if(r[i] == 'A'){ errors.TA++; errors.sequence += 's'; }
906 if(r[i] == 'T'){ errors.TT++; errors.matches++; errors.sequence += 'm'; }
907 if(r[i] == 'G'){ errors.TG++; errors.sequence += 's'; }
908 if(r[i] == 'C'){ errors.TC++; errors.sequence += 's'; }
909 if(r[i] == '-'){ errors.Ti++; errors.sequence += 'i'; }
911 else if(q[i] == 'G'){
912 if(r[i] == 'A'){ errors.GA++; errors.sequence += 's'; }
913 if(r[i] == 'T'){ errors.GT++; errors.sequence += 's'; }
914 if(r[i] == 'G'){ errors.GG++; errors.matches++; errors.sequence += 'm'; }
915 if(r[i] == 'C'){ errors.GC++; errors.sequence += 's'; }
916 if(r[i] == '-'){ errors.Gi++; errors.sequence += 'i'; }
918 else if(q[i] == 'C'){
919 if(r[i] == 'A'){ errors.CA++; errors.sequence += 's'; }
920 if(r[i] == 'T'){ errors.CT++; errors.sequence += 's'; }
921 if(r[i] == 'G'){ errors.CG++; errors.sequence += 's'; }
922 if(r[i] == 'C'){ errors.CC++; errors.matches++; errors.sequence += 'm'; }
923 if(r[i] == '-'){ errors.Ci++; errors.sequence += 'i'; }
925 else if(q[i] == 'N'){
926 if(r[i] == 'A'){ errors.NA++; errors.sequence += 'a'; }
927 if(r[i] == 'T'){ errors.NT++; errors.sequence += 'a'; }
928 if(r[i] == 'G'){ errors.NG++; errors.sequence += 'a'; }
929 if(r[i] == 'C'){ errors.NC++; errors.sequence += 'a'; }
930 if(r[i] == '-'){ errors.Ni++; errors.sequence += 'a'; }
932 else if(q[i] == '-' && r[i] != '-'){
933 if(r[i] == 'A'){ errors.dA++; errors.sequence += 'd'; }
934 if(r[i] == 'T'){ errors.dT++; errors.sequence += 'd'; }
935 if(r[i] == 'G'){ errors.dG++; errors.sequence += 'd'; }
936 if(r[i] == 'C'){ errors.dC++; errors.sequence += 'd'; }
943 errors.sequence += 'd'; errors.total++;
946 errors.sequence += 'r';
951 else if(q[i] == '.' && r[i] != '.'){ // reference extends beyond query
952 if(started == 1){ break; }
954 else if(q[i] != '.' && r[i] == '.'){ // query extends beyond reference
955 if(started == 1){ break; }
957 else if(q[i] == '.' && r[i] == '.'){ // both are missing data
958 if(started == 1){ break; }
960 // cout << errors.sequence[errors.sequence.length()-1] << endl;
962 // cout << errors.sequence << endl;
963 errors.mismatches = errors.total-errors.matches;
964 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
965 errors.queryName = query.getName();
966 errors.refName = reference.getName();
971 catch(exception& e) {
972 m->errorOut(e, "SeqErrorCommand", "getErrors");
977 //***************************************************************************************************************
979 map<string, int> SeqErrorCommand::getWeights(){
981 m->openInputFile(namesFileName, nameFile);
984 string redundantSeqs;
985 map<string, int> nameCountMap;
988 nameFile >> seqName >> redundantSeqs;
989 nameCountMap[seqName] = m->getNumNames(redundantSeqs);
996 //***************************************************************************************************************
998 void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){
1000 errorSummaryFile << "query\treference\tweight\t";
1001 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
1002 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\tnumparents\n";
1004 errorSummaryFile << setprecision(6);
1005 errorSummaryFile.setf(ios::fixed);
1007 catch(exception& e) {
1008 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
1013 //***************************************************************************************************************
1015 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs, ofstream& errorSummaryFile, ofstream& errorSeqFile){
1018 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
1019 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
1020 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
1021 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
1022 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
1023 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
1024 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
1025 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
1027 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t'; //insertions
1028 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t'; //deletions
1029 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t'; //substitutions
1030 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t'; //ambiguities
1031 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
1033 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
1035 int a=0; int t=1; int g=2; int c=3;
1038 if(numParentSeqs == 1 || ignoreChimeras == 0){
1039 substitutionMatrix[a][a] += error.weight * error.AA;
1040 substitutionMatrix[a][t] += error.weight * error.TA;
1041 substitutionMatrix[a][g] += error.weight * error.GA;
1042 substitutionMatrix[a][c] += error.weight * error.CA;
1043 substitutionMatrix[a][gap] += error.weight * error.dA;
1044 substitutionMatrix[a][n] += error.weight * error.NA;
1046 substitutionMatrix[t][a] += error.weight * error.AT;
1047 substitutionMatrix[t][t] += error.weight * error.TT;
1048 substitutionMatrix[t][g] += error.weight * error.GT;
1049 substitutionMatrix[t][c] += error.weight * error.CT;
1050 substitutionMatrix[t][gap] += error.weight * error.dT;
1051 substitutionMatrix[t][n] += error.weight * error.NT;
1053 substitutionMatrix[g][a] += error.weight * error.AG;
1054 substitutionMatrix[g][t] += error.weight * error.TG;
1055 substitutionMatrix[g][g] += error.weight * error.GG;
1056 substitutionMatrix[g][c] += error.weight * error.CG;
1057 substitutionMatrix[g][gap] += error.weight * error.dG;
1058 substitutionMatrix[g][n] += error.weight * error.NG;
1060 substitutionMatrix[c][a] += error.weight * error.AC;
1061 substitutionMatrix[c][t] += error.weight * error.TC;
1062 substitutionMatrix[c][g] += error.weight * error.GC;
1063 substitutionMatrix[c][c] += error.weight * error.CC;
1064 substitutionMatrix[c][gap] += error.weight * error.dC;
1065 substitutionMatrix[c][n] += error.weight * error.NC;
1067 substitutionMatrix[gap][a] += error.weight * error.Ai;
1068 substitutionMatrix[gap][t] += error.weight * error.Ti;
1069 substitutionMatrix[gap][g] += error.weight * error.Gi;
1070 substitutionMatrix[gap][c] += error.weight * error.Ci;
1071 substitutionMatrix[gap][n] += error.weight * error.Ni;
1074 catch(exception& e) {
1075 m->errorOut(e, "SeqErrorCommand", "printErrorData");
1080 //***************************************************************************************************************
1082 void SeqErrorCommand::printSubMatrix(){
1084 string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName));
1085 string subMatrixFileName = fileNameRoot + getOutputFileNameTag("errormatrix");
1086 ofstream subMatrixFile;
1087 m->openOutputFile(subMatrixFileName, subMatrixFile);
1088 outputNames.push_back(subMatrixFileName); outputTypes["errormatrix"].push_back(subMatrixFileName);
1089 vector<string> bases(6);
1096 vector<int> refSums(5,1);
1098 for(int i=0;i<5;i++){
1099 subMatrixFile << "\tr" << bases[i];
1101 for(int j=0;j<6;j++){
1102 refSums[i] += substitutionMatrix[i][j];
1105 subMatrixFile << endl;
1107 for(int i=0;i<6;i++){
1108 subMatrixFile << 'q' << bases[i];
1109 for(int j=0;j<5;j++){
1110 subMatrixFile << '\t' << substitutionMatrix[j][i];
1112 subMatrixFile << endl;
1115 subMatrixFile << "total";
1116 for(int i=0;i<5;i++){
1117 subMatrixFile << '\t' << refSums[i];
1119 subMatrixFile << endl;
1120 subMatrixFile.close();
1122 catch(exception& e) {
1123 m->errorOut(e, "SeqErrorCommand", "printSubMatrix");
1127 //***************************************************************************************************************
1129 void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
1131 string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName));
1132 string errorForwardFileName = fileNameRoot + getOutputFileNameTag("errorforward");
1133 ofstream errorForwardFile;
1134 m->openOutputFile(errorForwardFileName, errorForwardFile);
1135 outputNames.push_back(errorForwardFileName); outputTypes["errorforward"].push_back(errorForwardFileName);
1137 errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
1138 for(int i=0;i<maxLength;i++){
1139 float match = (float)errorForward['m'][i];
1140 float subst = (float)errorForward['s'][i];
1141 float insert = (float)errorForward['i'][i];
1142 float del = (float)errorForward['d'][i];
1143 float amb = (float)errorForward['a'][i];
1144 float total = match + subst + insert + del + amb;
1145 if(total == 0){ break; }
1146 errorForwardFile << i+1 << '\t' << total << '\t' << match/total << '\t' << subst/total << '\t' << insert/total << '\t' << del/total << '\t' << amb/total << endl;
1148 errorForwardFile.close();
1150 string errorReverseFileName = fileNameRoot + getOutputFileNameTag("errorreverse");
1151 ofstream errorReverseFile;
1152 m->openOutputFile(errorReverseFileName, errorReverseFile);
1153 outputNames.push_back(errorReverseFileName); outputTypes["errorreverse"].push_back(errorReverseFileName);
1155 errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
1156 for(int i=0;i<maxLength;i++){
1157 float match = (float)errorReverse['m'][i];
1158 float subst = (float)errorReverse['s'][i];
1159 float insert = (float)errorReverse['i'][i];
1160 float del = (float)errorReverse['d'][i];
1161 float amb = (float)errorReverse['a'][i];
1162 float total = match + subst + insert + del + amb;
1163 if(total == 0){ break; }
1164 errorReverseFile << i+1 << '\t' << total << '\t' << match/total << '\t' << subst/total << '\t' << insert/total << '\t' << del/total << '\t' << amb/total << endl;
1166 errorReverseFile.close();
1168 catch(exception& e) {
1169 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
1174 //***************************************************************************************************************
1176 void SeqErrorCommand::printErrorQuality(map<char, vector<int> > qScoreErrorMap){
1178 string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName));
1179 string errorQualityFileName = fileNameRoot + getOutputFileNameTag("errorquality");
1180 ofstream errorQualityFile;
1181 m->openOutputFile(errorQualityFileName, errorQualityFile);
1182 outputNames.push_back(errorQualityFileName); outputTypes["errorquality"].push_back(errorQualityFileName);
1184 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
1185 for(int i=0;i<41;i++){
1186 errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
1188 errorQualityFile.close();
1190 catch(exception& e) {
1191 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
1197 //***************************************************************************************************************
1199 void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
1203 int numColumns = qualForwardMap[0].size();
1205 for(int i=0;i<qualForwardMap.size();i++){
1206 for(int j=0;j<numColumns;j++){
1207 if(qualForwardMap[i][j] != 0){
1208 if(numRows < i) { numRows = i+20; }
1212 string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName));
1213 string qualityForwardFileName = fileNameRoot + getOutputFileNameTag("errorqualforward");
1214 ofstream qualityForwardFile;
1215 m->openOutputFile(qualityForwardFileName, qualityForwardFile);
1216 outputNames.push_back(qualityForwardFileName); outputTypes["errorqualforward"].push_back(qualityForwardFileName);
1218 for(int i=0;i<numColumns;i++){ qualityForwardFile << '\t' << i; } qualityForwardFile << endl;
1220 for(int i=0;i<numRows;i++){
1221 qualityForwardFile << i+1;
1222 for(int j=0;j<numColumns;j++){
1223 qualityForwardFile << '\t' << qualForwardMap[i][j];
1226 qualityForwardFile << endl;
1228 qualityForwardFile.close();
1231 string qualityReverseFileName = fileNameRoot + getOutputFileNameTag("errorqualreverse");
1232 ofstream qualityReverseFile;
1233 m->openOutputFile(qualityReverseFileName, qualityReverseFile);
1234 outputNames.push_back(qualityReverseFileName); outputTypes["errorqualreverse"].push_back(qualityReverseFileName);
1236 for(int i=0;i<numColumns;i++){ qualityReverseFile << '\t' << i; } qualityReverseFile << endl;
1237 for(int i=0;i<numRows;i++){
1239 qualityReverseFile << i+1;
1240 for(int j=0;j<numColumns;j++){
1241 qualityReverseFile << '\t' << qualReverseMap[i][j];
1243 qualityReverseFile << endl;
1245 qualityReverseFile.close();
1247 catch(exception& e) {
1248 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
1253 /**************************************************************************************************/
1255 int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos, vector<unsigned long long>& rfileFilePos) {
1257 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1258 //set file positions for fasta file
1259 fastaFilePos = m->divideFile(filename, processors);
1261 if (qfilename == "") { return processors; }
1263 //get name of first sequence in each chunk
1264 map<string, int> firstSeqNames;
1265 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1267 m->openInputFile(filename, in);
1268 in.seekg(fastaFilePos[i]);
1271 firstSeqNames[temp.getName()] = i;
1276 //make copy to use below
1277 map<string, int> firstSeqNamesReport = firstSeqNames;
1279 //seach for filePos of each first name in the qfile and save in qfileFilePos
1281 m->openInputFile(qfilename, inQual);
1284 while(!inQual.eof()){
1285 input = m->getline(inQual);
1287 if (input.length() != 0) {
1288 if(input[0] == '>'){ //this is a sequence name line
1289 istringstream nameStream(input);
1291 string sname = ""; nameStream >> sname;
1292 sname = sname.substr(1);
1294 map<string, int>::iterator it = firstSeqNames.find(sname);
1296 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1297 unsigned long long pos = inQual.tellg();
1298 qfileFilePos.push_back(pos - input.length() - 1);
1299 firstSeqNames.erase(it);
1304 if (firstSeqNames.size() == 0) { break; }
1308 if (firstSeqNames.size() != 0) {
1309 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1310 m->mothurOut(it->first + " is in your fasta file and not in your quality file, aborting."); m->mothurOutEndLine();
1312 m->control_pressed = true;
1316 //get last file position of qfile
1318 unsigned long long size;
1320 //get num bytes in file
1321 pFile = fopen (qfilename.c_str(),"rb");
1322 if (pFile==NULL) perror ("Error opening file");
1324 fseek (pFile, 0, SEEK_END);
1329 qfileFilePos.push_back(size);
1331 //seach for filePos of each first name in the rfile and save in rfileFilePos
1334 m->openInputFile(rfilename, inR);
1336 //read column headers
1337 for (int i = 0; i < 16; i++) {
1338 if (!inR.eof()) { inR >> junk; }
1344 if (m->control_pressed) { inR.close(); return processors; }
1346 input = m->getline(inR);
1348 if (input.length() != 0) {
1350 istringstream nameStream(input);
1351 string sname = ""; nameStream >> sname;
1353 map<string, int>::iterator it = firstSeqNamesReport.find(sname);
1355 if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk
1356 unsigned long long pos = inR.tellg();
1357 rfileFilePos.push_back(pos - input.length() - 1);
1358 firstSeqNamesReport.erase(it);
1362 if (firstSeqNamesReport.size() == 0) { break; }
1367 if (firstSeqNamesReport.size() != 0) {
1368 for (map<string, int>::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) {
1369 m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine();
1371 m->control_pressed = true;
1375 //get last file position of qfile
1377 unsigned long long sizeR;
1379 //get num bytes in file
1380 rFile = fopen (rfilename.c_str(),"rb");
1381 if (rFile==NULL) perror ("Error opening file");
1383 fseek (rFile, 0, SEEK_END);
1384 sizeR=ftell (rFile);
1388 rfileFilePos.push_back(sizeR);
1394 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1395 //get last file position of fastafile
1397 unsigned long long size;
1399 //get num bytes in file
1400 pFile = fopen (filename.c_str(),"rb");
1401 if (pFile==NULL) perror ("Error opening file");
1403 fseek (pFile, 0, SEEK_END);
1407 fastaFilePos.push_back(size);
1409 //get last file position of fastafile
1412 //get num bytes in file
1413 qFile = fopen (qfilename.c_str(),"rb");
1414 if (qFile==NULL) perror ("Error opening file");
1416 fseek (qFile, 0, SEEK_END);
1420 qfileFilePos.push_back(size);
1426 catch(exception& e) {
1427 m->errorOut(e, "SeqErrorCommand", "setLines");
1431 //***************************************************************************************************************