+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ string tag = cluster->getTag();
+
+ double clusterCutoff = cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(cutoff));
+
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ ofstream listFile;
+ m->openOutputFile(listFileName, listFile);
+ list->print(listFile);
+ listFile.close();
+
+ delete matrix; delete cluster; delete rabund; delete list;
+
+ return listFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "cluster");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::calcCentroidsDriver(int start, int finish){
+
+ //this function gets the most likely homopolymer length at a flow position for a group of sequences
+ //within an otu
+
+ try{
+
+ for(int i=start;i<finish;i++){
+
+ if (m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+ try{
+
+ int flowAValue = cent * numFlowCells;
+ int flowBValue = flow * numFlowCells;
+
+ double dist = 0;
+
+ for(int i=0;i<length;i++){
+ dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ return dist / (double)length;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getDistToCentroid");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getNewWeights(){
+ try{
+
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
+ }
+
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
+ }
+ return maxChange;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+ /**************************************************************************************************/
+
+double ShhherCommand::getLikelihood(){
+
+ try{
+
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
+
+ P[nI] += weight[i] * exp(-singleDist * sigma);
+ }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::checkCentroids(){
+ try{
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
+ exit(1);
+ }
+}
+ /**************************************************************************************************/
+
+
+
+void ShhherCommand::writeQualities(vector<int> otuCounts){
+
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
+
+ ofstream qualityFile;
+ m->openOutputFile(qualityFileName, qualityFile);
+
+ qualityFile.setf(ios::fixed, ios::floatfield);
+ qualityFile.setf(ios::showpoint);
+ qualityFile << setprecision(6);
+
+ vector<vector<int> > qualities(numOTUs);
+ vector<double> pr(HOMOPS, 0);
+
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = 0;
+ int base = 0;
+
+ if(nSeqsPerOTU[i] > 0){
+ qualities[i].assign(1024, -1);
+
+ while(index < numFlowCells){
+ double maxPrValue = 1e8;
+ short maxPrIndex = -1;
+ double count = 0.0000;
+
+ pr.assign(HOMOPS, 0);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int lIndex = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[lIndex]];
+ int sequenceIndex = aaI[i][j];
+ short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+ count += tauValue;
+
+ for(int s=0;s<HOMOPS;s++){
+ pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
+ }
+ }
+
+ maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+ maxPrValue = pr[maxPrIndex];
+
+ if(count > MIN_COUNT){
+ double U = 0.0000;
+ double norm = 0.0000;
+
+ for(int s=0;s<HOMOPS;s++){
+ norm += exp(-(pr[s] - maxPrValue));
+ }
+
+ for(int s=1;s<=maxPrIndex;s++){
+ int value = 0;
+ double temp = 0.0000;
+
+ U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+ if(U>0.00){
+ temp = log10(U);
+ }
+ else{
+ temp = -10.1;
+ }
+ temp = floor(-10 * temp);
+ value = (int)floor(temp);
+ if(value > 100){ value = 100; }
+
+ qualities[i][base] = (int)value;
+ base++;
+ }
+ }
+
+ index++;
+ }
+ }
+
+
+ if(otuCounts[i] > 0){
+ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+ int j=4; //need to get past the first four bases
+ while(qualities[i][j] != -1){
+ qualityFile << qualities[i][j] << ' ';
+ j++;
+ }
+ qualityFile << endl;
+ }
+ }
+ qualityFile.close();
+ outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeQualities");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeSequences(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
+ ofstream fastaFile;
+ m->openOutputFile(fastaFileName, fastaFile);
+
+ vector<string> names(numOTUs, "");
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int index = centroids[i];
+
+ if(otuCounts[i] > 0){
+ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+ string newSeq = "";
+
+ for(int j=0;j<numFlowCells;j++){
+
+ char base = flowOrder[j % 4];
+ for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+ newSeq += base;
+ }
+ }
+
+ fastaFile << newSeq.substr(4) << endl;
+ }
+ }
+ fastaFile.close();
+
+ outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
+
+ if(compositeFASTAFileName != ""){
+ m->appendFiles(fastaFileName, compositeFASTAFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeSequences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeNames(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
+ ofstream nameFile;
+ m->openOutputFile(nameFileName, nameFile);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ if(otuCounts[i] > 0){
+ nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+ for(int j=1;j<nSeqsPerOTU[i];j++){
+ nameFile << ',' << seqNameVector[aaI[i][j]];
+ }
+
+ nameFile << endl;
+ }
+ }
+ nameFile.close();
+ outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+
+
+ if(compositeNamesFileName != ""){
+ m->appendFiles(nameFileName, compositeNamesFileName);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeNames");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeGroups(){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
+ int pos = fileRoot.find_first_of('.');
+ string fileGroup = fileRoot;
+ if (pos != string::npos) { fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1))); }
+ string groupFileName = thisOutputDir + fileRoot + getOutputFileNameTag("group");
+ ofstream groupFile;
+ m->openOutputFile(groupFileName, groupFile);
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { break; }
+ groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
+ }
+ groupFile.close();
+ outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeGroups");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeClusters(vector<int> otuCounts){
+ try {
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+ string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) +getOutputFileNameTag("counts");
+ ofstream otuCountsFile;
+ m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+ string bases = flowOrder;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) {
+ break;
+ }
+ //output the translated version of the centroid sequence for the otu
+ if(otuCounts[i] > 0){
+ int index = centroids[i];
+
+ otuCountsFile << "ideal\t";
+ for(int j=8;j<numFlowCells;j++){
+ char base = bases[j % 4];
+ for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+ otuCountsFile << base;
+ }
+ }
+ otuCountsFile << endl;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int sequence = aaI[i][j];
+ otuCountsFile << seqNameVector[sequence] << '\t';
+
+ string newSeq = "";
+
+ for(int k=0;k<lengths[sequence];k++){
+ char base = bases[k % 4];
+ int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+ for(int s=0;s<freq;s++){
+ newSeq += base;
+ //otuCountsFile << base;
+ }
+ }
+ otuCountsFile << newSeq.substr(4) << endl;
+ }
+ otuCountsFile << endl;
+ }
+ }
+ otuCountsFile.close();
+ outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "writeClusters");
+ exit(1);
+ }
+}
+
+#else
+//**********************************************************************************************************************
+
+int ShhherCommand::execute(){
+ try {
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }