+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ string tag = cluster->getTag();
+
+ double clusterCutoff = cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(cutoff));
+
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ ofstream listFile;
+ m->openOutputFile(listFileName, listFile);
+ list->print(listFile);
+ listFile.close();
+
+ delete matrix; delete cluster; delete rabund; delete list;
+
+ return listFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "cluster");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::calcCentroidsDriver(int start, int finish){
+
+ //this function gets the most likely homopolymer length at a flow position for a group of sequences
+ //within an otu
+
+ try{
+
+ for(int i=start;i<finish;i++){
+
+ if (m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+ try{
+
+ int flowAValue = cent * numFlowCells;
+ int flowBValue = flow * numFlowCells;
+
+ double dist = 0;
+
+ for(int i=0;i<length;i++){
+ dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ return dist / (double)length;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getDistToCentroid");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getNewWeights(){
+ try{
+
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
+ }
+
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
+ }
+ return maxChange;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+ /**************************************************************************************************/
+
+double ShhherCommand::getLikelihood(){
+
+ try{
+
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
+
+ P[nI] += weight[i] * exp(-singleDist * sigma);
+ }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::checkCentroids(){
+ try{
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
+ exit(1);
+ }
+}
+ /**************************************************************************************************/
+
+
+
+void ShhherCommand::writeQualities(vector<int> otuCounts){
+
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string qualityFileName = getOutputFileName("qfile",variables);
+
+ ofstream qualityFile;
+ m->openOutputFile(qualityFileName, qualityFile);
+
+ qualityFile.setf(ios::fixed, ios::floatfield);
+ qualityFile.setf(ios::showpoint);
+ qualityFile << setprecision(6);
+
+ vector<vector<int> > qualities(numOTUs);
+ vector<double> pr(HOMOPS, 0);
+
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = 0;
+ int base = 0;
+
+ if(nSeqsPerOTU[i] > 0){
+ qualities[i].assign(1024, -1);
+
+ while(index < numFlowCells){
+ double maxPrValue = 1e8;
+ short maxPrIndex = -1;
+ double count = 0.0000;
+
+ pr.assign(HOMOPS, 0);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int lIndex = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[lIndex]];
+ int sequenceIndex = aaI[i][j];
+ short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+ count += tauValue;
+
+ for(int s=0;s<HOMOPS;s++){
+ pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
+ }
+ }
+
+ maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+ maxPrValue = pr[maxPrIndex];
+
+ if(count > MIN_COUNT){
+ double U = 0.0000;
+ double norm = 0.0000;
+
+ for(int s=0;s<HOMOPS;s++){
+ norm += exp(-(pr[s] - maxPrValue));
+ }
+
+ for(int s=1;s<=maxPrIndex;s++){
+ int value = 0;
+ double temp = 0.0000;
+
+ U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+ if(U>0.00){
+ temp = log10(U);
+ }
+ else{
+ temp = -10.1;
+ }
+ temp = floor(-10 * temp);
+ value = (int)floor(temp);
+ if(value > 100){ value = 100; }
+
+ qualities[i][base] = (int)value;
+ base++;
+ }
+ }
+
+ index++;
+ }
+ }
+
+
+ if(otuCounts[i] > 0){
+ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+ int j=4; //need to get past the first four bases
+ while(qualities[i][j] != -1){
+ qualityFile << qualities[i][j] << ' ';
+ j++;
+ }
+ qualityFile << endl;
+ }
+ }
+ qualityFile.close();
+ outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeQualities");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeSequences(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string fastaFileName = getOutputFileName("fasta",variables);
+ ofstream fastaFile;
+ m->openOutputFile(fastaFileName, fastaFile);
+
+ vector<string> names(numOTUs, "");
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = centroids[i];
+
+ if(otuCounts[i] > 0){
+ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+ string newSeq = "";
+
+ for(int j=0;j<numFlowCells;j++){
+
+ char base = flowOrder[j % flowOrder.length()];
+ for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+ newSeq += base;
+ }
+ }
+
+ fastaFile << newSeq.substr(4) << endl;
+ }
+ }
+ fastaFile.close();
+
+ outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
+
+ if(compositeFASTAFileName != ""){
+ m->appendFiles(fastaFileName, compositeFASTAFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeSequences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeNames(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string nameFileName = getOutputFileName("name",variables);
+ ofstream nameFile;
+ m->openOutputFile(nameFileName, nameFile);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(otuCounts[i] > 0){
+ nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+ for(int j=1;j<nSeqsPerOTU[i];j++){
+ nameFile << ',' << seqNameVector[aaI[i][j]];
+ }
+
+ nameFile << endl;
+ }
+ }
+ nameFile.close();
+ outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+
+
+ if(compositeNamesFileName != ""){
+ m->appendFiles(nameFileName, compositeNamesFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeNames");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeGroups(){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
+ int pos = fileRoot.find_first_of('.');
+ string fileGroup = fileRoot;
+ if (pos != string::npos) { fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1))); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + fileRoot;
+ string groupFileName = getOutputFileName("group",variables);
+ ofstream groupFile;
+ m->openOutputFile(groupFileName, groupFile);
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { break; }
+ groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
+ }
+ groupFile.close();
+ outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeGroups");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeClusters(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string otuCountsFileName = getOutputFileName("counts",variables);
+ ofstream otuCountsFile;
+ m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+ string bases = flowOrder;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) {
+ break;
+ }
+ //output the translated version of the centroid sequence for the otu
+ if(otuCounts[i] > 0){
+ int index = centroids[i];
+
+ otuCountsFile << "ideal\t";
+ for(int j=8;j<numFlowCells;j++){
+ char base = bases[j % bases.length()];
+ for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+ otuCountsFile << base;
+ }
+ }
+ otuCountsFile << endl;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int sequence = aaI[i][j];
+ otuCountsFile << seqNameVector[sequence] << '\t';
+
+ string newSeq = "";
+
+ for(int k=0;k<lengths[sequence];k++){
+ char base = bases[k % bases.length()];
+ int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+ for(int s=0;s<freq;s++){
+ newSeq += base;
+ //otuCountsFile << base;
+ }
+ }
+ otuCountsFile << newSeq.substr(4) << endl;
+ }
+ otuCountsFile << endl;
+ }
+ }
+ otuCountsFile.close();
+ outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeClusters");
+ exit(1);
+ }
+}
+
+#else
+//**********************************************************************************************************************
+
+int ShhherCommand::execute(){
+ try {
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+ getSingleLookUp(); if (m->control_pressed) { return 0; }
+ getJointLookUp(); if (m->control_pressed) { return 0; }
+
+ int numFiles = flowFileVector.size();
+
+ if (numFiles < processors) { processors = numFiles; }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
+ else { createProcesses(flowFileVector); } //each processor processes one file
+#else
+ driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
+#endif
+
+ if(compositeFASTAFileName != ""){
+ outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
+ outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
+ }
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "execute");
+ exit(1);
+ }
+}
+#endif
+//********************************************************************************************************************
+//sorts biggest to smallest
+inline bool compareFileSizes(string left, string right){
+
+ FILE * pFile;
+ long leftsize = 0;
+
+ //get num bytes in file
+ string filename = left;
+ pFile = fopen (filename.c_str(),"rb");
+ string error = "Error opening " + filename;
+ if (pFile==NULL) perror (error.c_str());
+ else{
+ fseek (pFile, 0, SEEK_END);
+ leftsize=ftell (pFile);
+ fclose (pFile);
+ }
+
+ FILE * pFile2;
+ long rightsize = 0;
+
+ //get num bytes in file
+ filename = right;
+ pFile2 = fopen (filename.c_str(),"rb");
+ error = "Error opening " + filename;
+ if (pFile2==NULL) perror (error.c_str());
+ else{
+ fseek (pFile2, 0, SEEK_END);
+ rightsize=ftell (pFile2);
+ fclose (pFile2);
+ }
+
+ return (leftsize > rightsize);
+}
+/**************************************************************************************************/
+
+int ShhherCommand::createProcesses(vector<string> filenames){
+ try {
+ vector<int> processIDS;
+ int process = 1;
+ int num = 0;
+
+ //sanity check
+ if (filenames.size() < processors) { processors = filenames.size(); }
+
+ //sort file names by size to divide load better
+ sort(filenames.begin(), filenames.end(), compareFileSizes);
+
+ vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
+ dividedFiles.resize(processors);
+
+ //for each file, figure out which process will complete it
+ //want to divide the load intelligently so the big files are spread between processes
+ for (int i = 0; i < filenames.size(); i++) {
+ int processToAssign = (i+1) % processors;
+ if (processToAssign == 0) { processToAssign = processors; }
+
+ dividedFiles[(processToAssign-1)].push_back(filenames[i]);
+ }
+
+ //now lets reverse the order of ever other process, so we balance big files running with little ones
+ for (int i = 0; i < processors; i++) {
+ int remainder = ((i+1) % processors);
+ if (remainder) { reverse(dividedFiles[i].begin(), dividedFiles[i].end()); }
+ }
+
+
+ //divide the groups between the processors
+ /*vector<linePair> lines;
+ vector<int> numFilesToComplete;
+ int numFilesPerProcessor = filenames.size() / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numFilesPerProcessor;
+ int endIndex = (i+1) * numFilesPerProcessor;
+ if(i == (processors - 1)){ endIndex = filenames.size(); }
+ lines.push_back(linePair(startIndex, endIndex));
+ numFilesToComplete.push_back((endIndex-startIndex));
+ }*/
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp");
+
+ //pass numSeqs to parent
+ ofstream out;
+ string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+ out << num << endl;
+ out.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //do my part
+ driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ /*
+ vector<shhhFlowsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ // Allocate memory for thread data.
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; }
+
+ shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
+ pDataArray.push_back(tempFlow);
+ processIDS.push_back(i);
+
+ hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //using the main process as a worker saves time and memory
+ //do my part
+ driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+ */
+ #endif
+
+ for (int i=0;i<processIDS.size();i++) {
+ ifstream in;
+ string tempFile = compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
+ m->openInputFile(tempFile, in);
+ if (!in.eof()) {
+ int tempNum = 0;
+ in >> tempNum;
+ if (tempNum != dividedFiles[i+1].size()) {
+ m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches. The flow files may be too large to process with multiple processors. \n");
+ }
+ }
+ in.close(); m->mothurRemove(tempFile);
+
+ if (compositeFASTAFileName != "") {
+ m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
+ m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
+ m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
+ m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "createProcesses");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+vector<string> ShhherCommand::parseFlowFiles(string filename){
+ try {
+ vector<string> files;
+ int count = 0;
+
+ ifstream in;
+ m->openInputFile(filename, in);
+
+ int thisNumFLows = 0;
+ in >> thisNumFLows; m->gobble(in);
+
+ while (!in.eof()) {
+ if (m->control_pressed) { break; }
+
+ ofstream out;
+ string outputFileName = filename + toString(count) + ".temp";
+ m->openOutputFile(outputFileName, out);
+ out << thisNumFLows << endl;
+ files.push_back(outputFileName);
+
+ int numLinesWrote = 0;
+ for (int i = 0; i < largeSize; i++) {
+ if (in.eof()) { break; }
+ string line = m->getline(in); m->gobble(in);
+ out << line << endl;
+ numLinesWrote++;
+ }
+ out.close();
+
+ if (numLinesWrote == 0) { m->mothurRemove(outputFileName); files.pop_back(); }
+ count++;
+ }
+ in.close();
+
+ if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); } files.clear(); }
+
+ m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n");
+
+ return files;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "parseFlowFiles");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
+ try {
+
+ int numCompleted = 0;
+
+ for(int i=0;i<filenames.size();i++){
+
+ if (m->control_pressed) { break; }
+
+ vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
+ if (large) { theseFlowFileNames = parseFlowFiles(filenames[i]); }
+
+ if (m->control_pressed) { break; }
+
+ double begClock = clock();
+ unsigned long long begTime;
+
+ string fileNameForOutput = filenames[i];
+
+ for (int g = 0; g < theseFlowFileNames.size(); g++) {
+
+ string flowFileName = theseFlowFileNames[g];
+ m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
+ m->mothurOut("Reading flowgrams...\n");
+
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
+ int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Identifying unique flowgrams...\n");
+ int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Calculating distances between flowgrams...\n");
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ begTime = time(NULL);
+
+
+ flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+
+ m->mothurOutEndLine();
+ m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nClustering flowgrams...\n");
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ cluster(listFileName, distFileName, namesFileName);
+
+ if (m->control_pressed) { break; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
+
+ int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurRemove(distFileName);
+ m->mothurRemove(namesFileName);
+ m->mothurRemove(listFileName);
+
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
+
+ if (m->control_pressed) { break; }
+
+ double maxDelta = 0;
+ int iter = 0;
+
+ begClock = clock();
+ begTime = time(NULL);
+
+ m->mothurOut("\nDenoising flowgrams...\n");
+ m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+
+ while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+
+ if (m->control_pressed) { break; }
+
+ double cycClock = clock();
+ unsigned long long cycTime = time(NULL);
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+
+ if (m->control_pressed) { break; }
+
+ double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+
+ if (m->control_pressed) { break; }
+
+ checkCentroids(numOTUs, centroids, weight);
+
+ if (m->control_pressed) { break; }
+
+ calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+
+ if (m->control_pressed) { break; }
+
+ iter++;
+
+ m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+
+ }
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nFinalizing...\n");
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ vector<int> otuCounts(numOTUs, 0);
+ for(int j=0;j<numSeqs;j++) { otuCounts[otuData[j]]++; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ if ((large) && (g == 0)) { flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir = m->hasPath(flowFileName); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string qualityFileName = getOutputFileName("qfile",variables);
+ string fastaFileName = getOutputFileName("fasta",variables);
+ string nameFileName = getOutputFileName("name",variables);
+ string otuCountsFileName = getOutputFileName("counts",variables);
+ string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
+ int pos = fileRoot.find_first_of('.');
+ string fileGroup = fileRoot;
+ if (pos != string::npos) { fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1))); }
+ string groupFileName = getOutputFileName("group",variables);
+
+
+ writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+ writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+ writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
+ writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
+ writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector); if (m->control_pressed) { break; }
+
+ if (large) {
+ if (g > 0) {
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
+ m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
+ m->mothurRemove(qualityFileName);
+ m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
+ m->mothurRemove(fastaFileName);
+ m->appendFiles(nameFileName, getOutputFileName("name",variables));
+ m->mothurRemove(nameFileName);
+ m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
+ m->mothurRemove(otuCountsFileName);
+ m->appendFiles(groupFileName, getOutputFileName("group",variables));
+ m->mothurRemove(groupFileName);
+ }
+ m->mothurRemove(theseFlowFileNames[g]);
+ }
+ }
+
+ numCompleted++;
+ m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
+ }