+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(maxInsert != -1 && maxInsert < LongestInsert) { goodSeq = 0; trashCode += "insert|"; }
+ if(minScore != -1 && minScore > SearchScore) { goodSeq = 0; trashCode += "score|"; }
+ if(minSim != -1 && minSim > SimBtwnQueryTemplate) { goodSeq = 0; trashCode += "sim|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
+ exit(1);
+ }
+
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenContigs(map<string, string>& badSeqNames){
+ try{
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport));
+ string outSummary = getOutputFileName("contigsreport",variables);
+ outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary);
+
+ string name;
+ //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
+ int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(contigsreport, in);
+ out << (m->getline(in)) << endl; //skip headers
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(oStart != -1 && oStart < thisOStart) { goodSeq = 0; trashCode += "ostart|"; }
+ if(oEnd != -1 && oEnd > thisOEnd) { goodSeq = 0; trashCode += "oend|"; }
+ if(maxN != -1 && maxN < numNs) { goodSeq = 0; trashCode += "n|"; }
+ if(minOverlap != -1 && minOverlap > OLength) { goodSeq = 0; trashCode += "olength|"; }
+ if(mismatches != -1 && mismatches < numMisMatches) { goodSeq = 0; trashCode += "mismatches|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenContigs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenSummary(map<string, string>& badSeqNames){
+ try{
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile));
+ string outSummary = getOutputFileName("summary",variables);
+ outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary);
+
+ string name;
+ int start, end, length, ambigs, polymer, numReps;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(summaryfile, in);
+ out << (m->getline(in)) << endl; //skip headers
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(startPos != -1 && startPos < start) { goodSeq = 0; trashCode += "start|"; }
+ if(endPos != -1 && endPos > end) { goodSeq = 0; trashCode += "end|"; }
+ if(maxAmbig != -1 && maxAmbig < ambigs) { goodSeq = 0; trashCode += "ambig|"; }
+ if(maxHomoP != -1 && maxHomoP < polymer) { goodSeq = 0; trashCode += "homop|"; }
+ if(minLength != -1 && minLength > length) { goodSeq = 0; trashCode += "<length|"; }
+ if(maxLength != -1 && maxLength < length) { goodSeq = 0; trashCode += ">length|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read summary file
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; //skip headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; }
+
+
+
+ return count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenSummary");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenFasta(map<string, string>& badSeqNames){
+ try{
+
+
+ //if the user want to optimize we need to know the 90% mark
+ vector<unsigned long long> positions;
+ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+ //use the namefile to optimize correctly
+ if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile);
+ nameMap = ct.getNameMap();
+ }
+ getSummary(positions);
+ }else {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
+#else
+ if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+ else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+#endif
+ }
+
+ if (m->control_pressed) { return 0; }
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ return numFastaSeqs;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenFasta");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::screenNameGroupFile(map<string, string> badSeqNames){
+ try {
+ ifstream inputNames;
+ m->openInputFile(namefile, inputNames);
+ map<string, string> badSeqGroups;
+ string seqName, seqList, group;
+ map<string, string>::iterator it;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
+ variables["[extension]"] = m->getExtension(namefile);
+ string goodNameFile = getOutputFileName("name", variables);
+ outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
+
+ ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
+
+ while(!inputNames.eof()){
+ if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
+
+ inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList;
+ it = badSeqNames.find(seqName);
+
+ if(it != badSeqNames.end()){
+ if(namefile != ""){
+ int start = 0;
+ for(int i=0;i<seqList.length();i++){
+ if(seqList[i] == ','){
+ badSeqGroups[seqList.substr(start,i-start)] = it->second;
+ start = i+1;
+ }
+ }
+ badSeqGroups[seqList.substr(start,seqList.length()-start)] = it->second;
+ }
+ badSeqNames.erase(it);
+ }
+ else{
+ goodNameOut << seqName << '\t' << seqList << endl;
+ }
+ m->gobble(inputNames);
+ }
+ inputNames.close();
+ goodNameOut.close();
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ m->mothurOut("Your namefile does not include the sequence " + it->first + " please correct.");
+ m->mothurOutEndLine();
+ }
+ }