+ string tempBuf = buffer;
+ if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
+ istringstream iss (tempBuf,istringstream::in);
+
+ delete buffer;
+ MPI_File_close(&inMPIAccnos);
+
+ badSeqNames.clear();
+ string tempName, trashCode;
+ while (!iss.eof()) {
+ iss >> tempName >> trashCode; m->gobble(iss);
+ badSeqNames[tempName] = trashCode;
+ }
+ }
+#endif
+
+
+ return numFastaSeqs;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "runFastaScreening");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenReports(map<string, string>& badSeqNames){
+ try{
+ int numFastaSeqs = 0;
+ bool summarizedFasta = false;
+
+ //did not provide a summary file, but set a parameter that requires summarizing the fasta file
+ //or did provide a summary file, but set maxn parameter so we must summarize the fasta file
+ vector<unsigned long long> positions;
+ if (((summaryfile == "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize)))) || ((summaryfile != "") && m->inUsersGroups("maxn", optimize))) {
+ //use the namefile to optimize correctly
+ if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile);
+ nameMap = ct.getNameMap();
+ }
+ getSummary(positions);
+ summarizedFasta = true;
+ } else {
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
+ #else
+ if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+ else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+ #endif
+ }
+
+ if ((summaryfile != "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize))) && !summarizedFasta) { //summarize based on summaryfile
+ if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile);
+ nameMap = ct.getNameMap();
+ }
+ getSummaryReport();
+ }else if ((contigsreport != "") && ((m->inUsersGroups("minoverlap", optimize)) || (m->inUsersGroups("ostart", optimize)) || (m->inUsersGroups("oend", optimize)) || (m->inUsersGroups("mismatches", optimize)))) { //optimize settings based on contigs file
+ optimizeContigs();
+ }else if ((alignreport != "") && ((m->inUsersGroups("minsim", optimize)) || (m->inUsersGroups("minscore", optimize)) || (m->inUsersGroups("maxinsert", optimize)))) { //optimize settings based on contigs file
+ optimizeAlign();
+ }
+
+
+ //provided summary file, and did not set maxn so no need to summarize fasta
+ if (summaryfile != "") { numFastaSeqs = screenSummary(badSeqNames); }
+ //add in any seqs that fail due to contigs report results
+ else if (contigsreport != "") { numFastaSeqs = screenContigs(badSeqNames); }
+ //add in any seqs that fail due to align report
+ else if (alignreport != "") { numFastaSeqs = screenAlignReport(badSeqNames); }
+
+ return numFastaSeqs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenReports");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int ScreenSeqsCommand::screenAlignReport(map<string, string>& badSeqNames){
+ try {
+
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
+ string outSummary = getOutputFileName("alignreport",variables);
+ outputNames.push_back(outSummary); outputTypes["alignreport"].push_back(outSummary);
+
+ string name, TemplateName, SearchMethod, AlignmentMethod;
+ //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
+ //checking for minScore, maxInsert, minSim
+ int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert;
+ float SearchScore, SimBtwnQueryTemplate;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(alignreport, in);
+ out << (m->getline(in)) << endl; //skip headers
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(maxInsert != -1 && maxInsert < LongestInsert) { goodSeq = 0; trashCode += "insert|"; }
+ if(minScore != -1 && minScore > SearchScore) { goodSeq = 0; trashCode += "score|"; }
+ if(minSim != -1 && minSim > SimBtwnQueryTemplate) { goodSeq = 0; trashCode += "sim|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
+ exit(1);
+ }
+
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenContigs(map<string, string>& badSeqNames){
+ try{
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport));
+ string outSummary = getOutputFileName("contigsreport",variables);
+ outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary);
+
+ string name;
+ //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
+ int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(contigsreport, in);
+ out << (m->getline(in)) << endl; //skip headers
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(oStart != -1 && oStart < thisOStart) { goodSeq = 0; trashCode += "ostart|"; }
+ if(oEnd != -1 && oEnd > thisOEnd) { goodSeq = 0; trashCode += "oend|"; }
+ if(maxN != -1 && maxN < numNs) { goodSeq = 0; trashCode += "n|"; }
+ if(minOverlap != -1 && minOverlap > OLength) { goodSeq = 0; trashCode += "olength|"; }
+ if(mismatches != -1 && mismatches < numMisMatches) { goodSeq = 0; trashCode += "mismatches|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenContigs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenSummary(map<string, string>& badSeqNames){
+ try{
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile));
+ string outSummary = getOutputFileName("summary",variables);
+ outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary);
+
+ string name;
+ int start, end, length, ambigs, polymer, numReps;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(summaryfile, in);
+ out << (m->getline(in)) << endl; //skip headers
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(startPos != -1 && startPos < start) { goodSeq = 0; trashCode += "start|"; }
+ if(endPos != -1 && endPos > end) { goodSeq = 0; trashCode += "end|"; }
+ if(maxAmbig != -1 && maxAmbig < ambigs) { goodSeq = 0; trashCode += "ambig|"; }
+ if(maxHomoP != -1 && maxHomoP < polymer) { goodSeq = 0; trashCode += "homop|"; }
+ if(minLength != -1 && minLength > length) { goodSeq = 0; trashCode += "<length|"; }
+ if(maxLength != -1 && maxLength < length) { goodSeq = 0; trashCode += ">length|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; }
+
+
+
+ return count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenSummary");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenFasta(map<string, string>& badSeqNames){
+ try{
+
+
+ //if the user want to optimize we need to know the 90% mark
+ vector<unsigned long long> positions;
+ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+ //use the namefile to optimize correctly
+ if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile);
+ nameMap = ct.getNameMap();
+ }
+ getSummary(positions);
+ }else {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
+#else
+ if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+ else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+#endif
+ }
+
+ if (m->control_pressed) { return 0; }
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ return numFastaSeqs;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenFasta");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::screenNameGroupFile(map<string, string> badSeqNames){
+ try {
+ ifstream inputNames;
+ m->openInputFile(namefile, inputNames);
+ map<string, string> badSeqGroups;
+ string seqName, seqList, group;
+ map<string, string>::iterator it;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
+ variables["[extension]"] = m->getExtension(namefile);
+ string goodNameFile = getOutputFileName("name", variables);
+ outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
+
+ ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
+
+ while(!inputNames.eof()){
+ if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
+
+ inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList;
+ it = badSeqNames.find(seqName);