#include "trimseqscommand.h"
#include "needlemanoverlap.hpp"
+#include "trimoligos.h"
//**********************************************************************************************************************
vector<string> TrimSeqsCommand::setParameters(){
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
+ CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
string helpString = "";
helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
- helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
+ helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
helpString += "The fasta parameter is required.\n";
helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
helpString += "The oligos parameter allows you to provide an oligos file.\n";
+ helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
outputTypes["fasta"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["group"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
else {
vector<string> myArray = setParameters();
outputTypes["fasta"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["group"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (path == "") { parameters["qfile"] = inputDir + it->second; }
}
+ it = parameters.find("name");
+ //user has given a names file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["name"] = inputDir + it->second; }
+ }
+
}
if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
}else if (fastaFile == "not open") { abort = true; }
+ else { m->setFastaFile(fastaFile); }
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
temp = validParameter.validFile(parameters, "oligos", true);
if (temp == "not found"){ oligoFile = ""; }
else if(temp == "not open"){ abort = true; }
- else { oligoFile = temp; }
+ else { oligoFile = temp; m->setOligosFile(oligoFile); }
temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
temp = validParameter.validFile(parameters, "qfile", true);
if (temp == "not found") { qFileName = ""; }
else if(temp == "not open") { abort = true; }
- else { qFileName = temp; }
+ else { qFileName = temp; m->setQualFile(qFileName); }
+
+ temp = validParameter.validFile(parameters, "name", true);
+ if (temp == "not found") { nameFile = ""; }
+ else if(temp == "not open") { nameFile = ""; abort = true; }
+ else { nameFile = temp; m->setNameFile(nameFile); }
temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
convert(temp, qThreshold);
numFPrimers = 0; //this needs to be initialized
numRPrimers = 0;
+ createGroup = false;
vector<vector<string> > fastaFileNames;
vector<vector<string> > qualFileNames;
+ vector<vector<string> > nameFileNames;
string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
+
if (qFileName != "") {
outputNames.push_back(trimQualFile);
outputNames.push_back(scrapQualFile);
outputTypes["qfile"].push_back(scrapQualFile);
}
+ string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
+ string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
+
+ if (nameFile != "") {
+ m->readNames(nameFile, nameMap);
+ outputNames.push_back(trimNameFile);
+ outputNames.push_back(scrapNameFile);
+ outputTypes["name"].push_back(trimNameFile);
+ outputTypes["name"].push_back(scrapNameFile);
+ }
+
+ if (m->control_pressed) { return 0; }
+
string outputGroupFileName;
if(oligoFile != ""){
- outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
- outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
- getOligos(fastaFileNames, qualFileNames);
+ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
+ if (createGroup) {
+ outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
+ outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
+ }
}
-
- vector<unsigned long int> fastaFilePos;
- vector<unsigned long int> qFilePos;
+
+ vector<unsigned long long> fastaFilePos;
+ vector<unsigned long long> qFilePos;
setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
- driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
+ driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
}else{
- createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
+ createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
}
#else
- driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
+ driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
#endif
if (m->control_pressed) { return 0; }
-
+
if(allFiles){
map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
map<string, string>::iterator it;
for(int i=0;i<fastaFileNames.size();i++){
for(int j=0;j<fastaFileNames[0].size();j++){
if (fastaFileNames[i][j] != "") {
- if(m->isBlank(fastaFileNames[i][j])){
- remove(fastaFileNames[i][j].c_str());
- namesToRemove.insert(fastaFileNames[i][j]);
+ if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
+ if(m->isBlank(fastaFileNames[i][j])){
+ m->mothurRemove(fastaFileNames[i][j]);
+ namesToRemove.insert(fastaFileNames[i][j]);
- if(qFileName != ""){
- remove(qualFileNames[i][j].c_str());
- namesToRemove.insert(qualFileNames[i][j]);
+ if(qFileName != ""){
+ m->mothurRemove(qualFileNames[i][j]);
+ namesToRemove.insert(qualFileNames[i][j]);
+ }
+
+ if(nameFile != ""){
+ m->mothurRemove(nameFileNames[i][j]);
+ namesToRemove.insert(nameFileNames[i][j]);
+ }
+ }else{
+ it = uniqueFastaNames.find(fastaFileNames[i][j]);
+ if (it == uniqueFastaNames.end()) {
+ uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
+ }
}
- }else{
- it = uniqueFastaNames.find(fastaFileNames[i][j]);
- if (it == uniqueFastaNames.end()) {
- uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
- }
}
}
}
}
}
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
//output group counts
m->mothurOutEndLine();
int total = 0;
+ if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
- total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine();
+ total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
}
if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
//set fasta file as new current fastafile
string current = "";
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
}
+ itTypes = outputTypes.find("name");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+ }
+
itTypes = outputTypes.find("qfile");
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
/**************************************************************************************/
-int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
+int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
try {
m->openOutputFile(scrapQFileName, scrapQualFile);
}
+ ofstream trimNameFile;
+ ofstream scrapNameFile;
+ if(nameFile != ""){
+ m->openOutputFile(trimNFileName, trimNameFile);
+ m->openOutputFile(scrapNFileName, scrapNameFile);
+ }
+
+
ofstream outGroupsFile;
- if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
+ if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
if(allFiles){
for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
if(qFileName != ""){
m->openOutputFile(qualFileNames[i][j], temp); temp.close();
}
+
+ if(nameFile != ""){
+ m->openOutputFile(nameFileNames[i][j], temp); temp.close();
+ }
}
}
}
int count = 0;
bool moreSeqs = 1;
+ TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
while (moreSeqs) {
if (m->control_pressed) {
inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
- if (oligoFile != "") { outGroupsFile.close(); }
+ if (createGroup) { outGroupsFile.close(); }
if(qFileName != ""){
qFile.close();
}
- for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
return 0;
}
int currentSeqsDiffs = 0;
Sequence currSeq(inFASTA); m->gobble(inFASTA);
-
+ //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
QualityScores currQual;
if(qFileName != ""){
currQual = QualityScores(qFile); m->gobble(qFile);
}
-
+
string origSeq = currSeq.getUnaligned();
if (origSeq != "") {
int primerIndex = 0;
if(barcodes.size() != 0){
- success = stripBarcode(currSeq, currQual, barcodeIndex);
+ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
if(numFPrimers != 0){
- success = stripForward(currSeq, currQual, primerIndex);
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
if(numRPrimers != 0){
- success = stripReverse(currSeq, currQual);
+ success = trimOligos.stripReverse(currSeq, currQual);
if(!success) { trashCode += 'r'; }
}
currQual.printQScores(trimQualFile);
}
- if(barcodes.size() != 0){
- string thisGroup = barcodeNameVector[barcodeIndex];
- if (primers.size() != 0) { thisGroup += "." + primerNameVector[primerIndex]; }
-
- outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
-
- map<string, int>::iterator it = groupCounts.find(thisGroup);
- if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
- else { groupCounts[it->first]++; }
-
+ if(nameFile != ""){
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
+ else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
}
+ if (createGroup) {
+ if(barcodes.size() != 0){
+ string thisGroup = barcodeNameVector[barcodeIndex];
+ if (primers.size() != 0) {
+ if (primerNameVector[primerIndex] != "") {
+ if(thisGroup != "") {
+ thisGroup += "." + primerNameVector[primerIndex];
+ }else {
+ thisGroup = primerNameVector[primerIndex];
+ }
+ }
+ }
+
+ outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+
+ if (nameFile != "") {
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ vector<string> thisSeqsNames;
+ m->splitAtChar(itName->second, thisSeqsNames, ',');
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+ }
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
+
+ map<string, int>::iterator it = groupCounts.find(thisGroup);
+ if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
+ else { groupCounts[it->first]++; }
+
+ }
+ }
if(allFiles){
ofstream output;
currQual.printQScores(output);
output.close();
}
+
+ if(nameFile != ""){
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
+ output << itName->first << '\t' << itName->second << endl;
+ output.close();
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
}
}
else{
+ if(nameFile != ""){ //needs to be before the currSeq name is changed
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
+ else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
currSeq.setName(currSeq.getName() + '|' + trashCode);
currSeq.setUnaligned(origSeq);
currSeq.setAligned(origSeq);
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= line->end)) { break; }
+
#else
if (inFASTA.eof()) { break; }
#endif
-
+
//report progress
if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
}
//report progress
if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
-
+
inFASTA.close();
trimFASTAFile.close();
scrapFASTAFile.close();
- if (oligoFile != "") { outGroupsFile.close(); }
+ if (createGroup) { outGroupsFile.close(); }
if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
+ if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
return count;
}
/**************************************************************************************************/
-int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
+int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 1;
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+ vector<vector<string> > tempNameFileNames = nameFileNames;
if(allFiles){
ofstream temp;
tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
}
+ if(nameFile != ""){
+ tempNameFileNames[i][j] += toString(getpid()) + ".temp";
+ m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
+ }
}
}
}
(scrapFASTAFileName + toString(getpid()) + ".temp"),
(trimQualFileName + toString(getpid()) + ".temp"),
(scrapQualFileName + toString(getpid()) + ".temp"),
+ (trimNameFileName + toString(getpid()) + ".temp"),
+ (scrapNameFileName + toString(getpid()) + ".temp"),
(groupFile + toString(getpid()) + ".temp"),
tempFASTAFileNames,
tempPrimerQualFileNames,
+ tempNameFileNames,
lines[process],
qLines[process]);
//pass groupCounts to parent
- ofstream out;
- string tempFile = filename + toString(getpid()) + ".num.temp";
- m->openOutputFile(tempFile, out);
- for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
- out << it->first << '\t' << it->second << endl;
+ if(createGroup){
+ ofstream out;
+ string tempFile = filename + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+
+ out << groupCounts.size() << endl;
+
+ for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
+ out << it->first << '\t' << it->second << endl;
+ }
+ out.close();
}
- out.close();
-
exit(0);
}else {
m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
ofstream temp;
m->openOutputFile(trimFASTAFileName, temp); temp.close();
m->openOutputFile(scrapFASTAFileName, temp); temp.close();
- m->openOutputFile(trimQualFileName, temp); temp.close();
- m->openOutputFile(scrapQualFileName, temp); temp.close();
+ if(qFileName != ""){
+ m->openOutputFile(trimQualFileName, temp); temp.close();
+ m->openOutputFile(scrapQualFileName, temp); temp.close();
+ }
+ if (nameFile != "") {
+ m->openOutputFile(trimNameFileName, temp); temp.close();
+ m->openOutputFile(scrapNameFileName, temp); temp.close();
+ }
- driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
+ driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
- remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
- remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
if(qFileName != ""){
m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
- remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
- remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
+ }
+
+ if(nameFile != ""){
+ m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
+ m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
+ m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
+ m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
}
- m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
- remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
+ if(createGroup){
+ m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
+ m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
+ }
if(allFiles){
for(int k=0;k<fastaFileNames[j].size();k++){
if (fastaFileNames[j][k] != "") {
m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
- remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
if(qFileName != ""){
m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
- remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
+ m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
+ }
+
+ if(nameFile != ""){
+ m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
+ m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
}
}
}
}
}
- ifstream in;
- string tempFile = filename + toString(processIDS[i]) + ".num.temp";
- m->openInputFile(tempFile, in);
- int tempNum;
- string group;
- while (!in.eof()) {
- in >> group >> tempNum; m->gobble(in);
+ if(createGroup){
+ ifstream in;
+ string tempFile = filename + toString(processIDS[i]) + ".num.temp";
+ m->openInputFile(tempFile, in);
+ int tempNum;
+ string group;
+
+ in >> tempNum; m->gobble(in);
- map<string, int>::iterator it = groupCounts.find(group);
- if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
- else { groupCounts[it->first] += tempNum; }
+ if (tempNum != 0) {
+ while (!in.eof()) {
+ in >> group >> tempNum; m->gobble(in);
+
+ map<string, int>::iterator it = groupCounts.find(group);
+ if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
+ else { groupCounts[it->first] += tempNum; }
+ }
+ }
+ in.close(); m->mothurRemove(tempFile);
}
- in.close(); remove(tempFile.c_str());
}
/**************************************************************************************************/
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
try {
-
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//set file positions for fasta file
fastaFilePos = m->divideFile(filename, processors);
map<string, int>::iterator it = firstSeqNames.find(sname);
if(it != firstSeqNames.end()) { //this is the start of a new chunk
- unsigned long int pos = inQual.tellg();
+ unsigned long long pos = inQual.tellg();
qfileFilePos.push_back(pos - input.length() - 1);
firstSeqNames.erase(it);
}
//get last file position of qfile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (qfilename.c_str(),"rb");
qfileFilePos.push_back(size);
return processors;
+
+ #else
+
+ fastaFilePos.push_back(0); qfileFilePos.push_back(0);
+ //get last file position of fastafile
+ FILE * pFile;
+ unsigned long long size;
+
+ //get num bytes in file
+ pFile = fopen (filename.c_str(),"rb");
+ if (pFile==NULL) perror ("Error opening file");
+ else{
+ fseek (pFile, 0, SEEK_END);
+ size=ftell (pFile);
+ fclose (pFile);
+ }
+ fastaFilePos.push_back(size);
+
+ //get last file position of qfile
+ FILE * qFile;
+
+ //get num bytes in file
+ qFile = fopen (qfilename.c_str(),"rb");
+ if (qFile==NULL) perror ("Error opening file");
+ else{
+ fseek (qFile, 0, SEEK_END);
+ size=ftell (qFile);
+ fclose (qFile);
+ }
+ qfileFilePos.push_back(size);
+
+ return 1;
+
+ #endif
}
catch(exception& e) {
m->errorOut(e, "TrimSeqsCommand", "setLines");
//***************************************************************************************************************
-void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
+bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
try {
ifstream inOligos;
m->openInputFile(oligoFile, inOligos);
while(!inOligos.eof()){
- inOligos >> type; m->gobble(inOligos);
+ inOligos >> type;
if(type[0] == '#'){
- while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
+ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
+ m->gobble(inOligos);
}
else{
+ m->gobble(inOligos);
//make type case insensitive
for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
for(int i=0;i<fastaFileNames.size();i++){
fastaFileNames[i].assign(primerNameVector.size(), "");
}
- if(qFileName != ""){ qualFileNames = fastaFileNames; }
+ if(qFileName != "") { qualFileNames = fastaFileNames; }
+ if(nameFile != "") { nameFileNames = fastaFileNames; }
if(allFiles){
set<string> uniqueNames; //used to cleanup outputFileNames
string comboGroupName = "";
string fastaFileName = "";
string qualFileName = "";
+ string nameFileName = "";
if(primerName == ""){
comboGroupName = barcodeNameVector[itBar->second];
comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
}
}
-
+
+
ofstream temp;
fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
if (uniqueNames.count(fastaFileName) == 0) {
fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
m->openOutputFile(fastaFileName, temp); temp.close();
-
+
if(qFileName != ""){
qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
- if (uniqueNames.count(fastaFileName) == 0) {
+ if (uniqueNames.count(qualFileName) == 0) {
outputNames.push_back(qualFileName);
outputTypes["qfile"].push_back(qualFileName);
}
qualFileNames[itBar->second][itPrimer->second] = qualFileName;
m->openOutputFile(qualFileName, temp); temp.close();
}
+
+ if(nameFile != ""){
+ nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
+ if (uniqueNames.count(nameFileName) == 0) {
+ outputNames.push_back(nameFileName);
+ outputTypes["name"].push_back(nameFileName);
+ }
+
+ nameFileNames[itBar->second][itPrimer->second] = nameFileName;
+ m->openOutputFile(nameFileName, temp); temp.close();
+ }
+
}
}
}
numFPrimers = primers.size();
numRPrimers = revPrimer.size();
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "getOligos");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
- try {
-
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
- //can you find the barcode
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
-
- if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
- }
-
- success = 0;
+ bool allBlank = true;
+ for (int i = 0; i < barcodeNameVector.size(); i++) {
+ if (barcodeNameVector[i] != "") {
+ allBlank = false;
break;
}
}
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
-
- for(it;it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
-
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
- try {
- string rawSequence = seq.getUnaligned();
- int success = pdiffs + 1; //guilty until proven innocent
-
- //can you find the primer
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
- success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
- if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
- }
- success = 0;
+ for (int i = 0; i < primerNameVector.size(); i++) {
+ if (primerNameVector[i] != "") {
+ allBlank = false;
break;
}
}
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((pdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
-
- for(it;it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){
- success = pdiffs + 100;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > pdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
+ if (allBlank) {
+ m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
+ allFiles = false;
+ return false;
}
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripForward");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
- try {
- string rawSequence = seq.getUnaligned();
- bool success = 0; //guilty until proven innocent
-
- for(int i=0;i<numRPrimers;i++){
- string oligo = revPrimer[i];
-
- if(rawSequence.length() < oligo.length()){
- success = 0;
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
- seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
- if(qual.getName() != ""){
- qual.trimQScores(-1, rawSequence.length()-oligo.length());
- }
- success = 1;
- break;
- }
- }
- return success;
+ return true;
}
catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripReverse");
+ m->errorOut(e, "TrimSeqsCommand", "getOligos");
exit(1);
}
}
-
//***************************************************************************************************************
bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
}
}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
- try {
- bool success = 1;
- int length = oligo.length();
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- return success;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::countDiffs(string oligo, string seq){
- try {
-
- int length = oligo.length();
- int countDiffs = 0;
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- }
-
- }
-
- return countDiffs;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "countDiffs");
- exit(1);
- }
-
-}
-
//***************************************************************************************************************