5 * Created by Pat Schloss on 8/31/11.
6 * Copyright 2011 Patrick D. Schloss. All rights reserved.
11 #include "sequence.hpp"
12 #include "listvector.hpp"
13 #include "inputdata.h"
15 #define MIN_DELTA 1.0e-6
19 #define MIN_TAU 1.0e-4
20 #define MIN_WEIGHT 0.1
23 /**************************************************************************************************/
24 int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
27 ifstream sequenceFile;
28 m->openInputFile(sequenceFileName, sequenceFile);
30 while(!sequenceFile.eof()){
32 if (m->control_pressed) { break; }
34 Sequence temp(sequenceFile); m->gobble(sequenceFile);
36 if (temp.getName() != "") {
37 sequences.push_back(temp.getAligned());
45 m->errorOut(e, "seqNoise", "getSequenceData");
49 /**************************************************************************************************/
50 int seqNoise::addSeq(string seq, vector<string>& sequences){
52 sequences.push_back(seq);
56 m->errorOut(e, "seqNoise", "addSeq");
60 /**************************************************************************************************/
61 //no checks for file mismatches
62 int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
64 string unique, redundant;
66 m->openInputFile(namesFileName, namesFile);
68 for(int i=0;i<redundantNames.size();i++){
70 if (m->control_pressed) { break; }
72 namesFile >> uniqueNames[i]; m->gobble(namesFile);
73 namesFile >> redundantNames[i]; m->gobble(namesFile);
75 seqFreq[i] = m->getNumNames(redundantNames[i]);
82 m->errorOut(e, "seqNoise", "getRedundantNames");
86 /**************************************************************************************************/
87 int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
90 uniqueNames.push_back(uniqueName);
91 redundantNames.push_back(redundantName);
92 seqFreq.push_back(m->getNumNames(redundantName));
97 m->errorOut(e, "seqNoise", "addRedundantName");
101 /**************************************************************************************************/
102 int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
106 m->openInputFile(distFileName, distFile);
113 for(int i=0;i<numSeqs;i++){
115 if (m->control_pressed) { break; }
117 distances[i * numSeqs + i] = 0.0000;
121 for(int j=0;j<i;j++){
122 distFile >> distances[i * numSeqs + j];
123 distances[j * numSeqs + i] = distances[i * numSeqs + j];
131 catch(exception& e) {
132 m->errorOut(e, "seqNoise", "getDistanceData");
137 /**************************************************************************************************/
138 int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
142 m->openInputFile(listFileName, listFile);
144 bool adjustCutoff = true;
145 string lastLabel = "";
147 while(!listFile.eof()){
149 ListVector list(listFile); m->gobble(listFile); //10/18/13 - change to reading with listvector to accomodate changes to the listfiel format. ie. adding header labels.
151 string thisLabel = list.getLabel();
152 lastLabel = thisLabel;
154 if (thisLabel == "unique") {} //skip to next label in listfile
157 m->mothurConvert(thisLabel, threshold);
159 if(threshold < cutOff){} //skip to next label in listfile
161 adjustCutoff = false;
162 int numOTUs = list.getNumBins();
163 otuFreq.resize(numOTUs, 0);
165 for(int i=0;i<numOTUs;i++){
167 if (m->control_pressed) { return 0; }
169 string otu = list.get(i);
173 for(int j=0;j<otu.size();j++){
178 int index = atoi(number.c_str());
185 int index = atoi(number.c_str());
192 otuBySeqLookUp.resize(numOTUs);
194 int numSeqs = otuData.size();
196 for(int i=0;i<numSeqs;i++){
197 if (m->control_pressed) { return 0; }
198 otuBySeqLookUp[otuData[i]].push_back(i);
200 for(int i=0;i<numOTUs;i++){
201 if (m->control_pressed) { return 0; }
202 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
203 otuBySeqLookUp[i].push_back(0);
214 //the listfile does not contain a threshold greater than the cutoff so use highest value
217 InputData input(listFileName, "list");
218 ListVector* list = input.getListVector(lastLabel);
220 int numOTUs = list->getNumBins();
221 otuFreq.resize(numOTUs, 0);
223 for(int i=0;i<numOTUs;i++){
225 if (m->control_pressed) { return 0; }
227 string otu = list->get(i);
232 for(int j=0;j<otu.size();j++){
237 int index = atoi(number.c_str());
244 int index = atoi(number.c_str());
251 otuBySeqLookUp.resize(numOTUs);
253 int numSeqs = otuData.size();
255 for(int i=0;i<numSeqs;i++){
256 if (m->control_pressed) { return 0; }
257 otuBySeqLookUp[otuData[i]].push_back(i);
259 for(int i=0;i<numOTUs;i++){
260 if (m->control_pressed) { return 0; }
261 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
262 otuBySeqLookUp[i].push_back(0);
271 catch(exception& e) {
272 m->errorOut(e, "seqNoise", "getListData");
277 /**************************************************************************************************/
278 int seqNoise::updateOTUCountData(vector<int> otuFreq,
279 vector<vector<int> > otuBySeqLookUp,
280 vector<vector<int> > aanI,
283 vector<int>& cumCount
286 int numOTUs = otuFreq.size();
290 for(int i=0;i<numOTUs;i++){
293 if (m->control_pressed) { return 0; }
295 for(int j=0;j<otuFreq[i];j++){
296 anP[count] = otuBySeqLookUp[i][j];
297 anI[count] = aanI[i][j];
305 catch(exception& e) {
306 m->errorOut(e, "seqNoise", "updateOTUCountData");
310 /**************************************************************************************************/
311 double seqNoise::calcNewWeights(
312 vector<double>& weights, //
313 vector<int> seqFreq, //
315 vector<int> cumCount, //
317 vector<int> otuFreq, //
318 vector<double> tau //
322 int numOTUs = weights.size();
323 double maxChange = -1;
327 for(int i=0;i<numOTUs;i++){
329 if (m->control_pressed) { return 0; }
331 double change = weights[i];
335 for(int j=0;j<otuFreq[i];j++){
337 int index1 = cumCount[i] + j;
338 int index2 = anI[index1];
340 double currentTau = tau[anP[index1]];
341 double freq = double(seqFreq[index2]);
343 weights[i] += currentTau * freq;
345 change = fabs(weights[i] - change);
347 if(change > maxChange){ maxChange = change; }
353 catch(exception& e) {
354 m->errorOut(e, "seqNoise", "calcNewWeights");
359 /**************************************************************************************************/
361 int seqNoise::calcCentroids(
365 vector<int>& centroids,
366 vector<int> cumCount,
367 vector<double> distances,///
373 int numOTUs = change.size();
374 int numSeqs = seqFreq.size();
376 for(int i=0;i<numOTUs;i++){
378 if (m->control_pressed) { return 0; }
381 double minFValue = 1e10;
384 double count = 0.00000;
386 int freqOfOTU = otuFreq[i];
388 for(int j=0;j<freqOfOTU;j++){
389 int index = cumCount[i] + j;
390 count += seqFreq[anI[index]]*tau[anP[index]];
393 if(freqOfOTU > 0 && count > MIN_COUNT){
395 vector<double> adF(freqOfOTU);
396 vector<int> anL(freqOfOTU);
398 for(int j=0;j<freqOfOTU;j++){
399 anL[j] = anI[cumCount[i] + j];
403 for(int j=0;j<freqOfOTU;j++){
404 int index = cumCount[i] + j;
405 double curTau = tau[anP[index]];
407 for(int k=0;k<freqOfOTU;k++){
408 double dist = distances[anL[j]*numSeqs + anL[k]];
410 adF[k] += dist * curTau * seqFreq[anL[j]];
414 for(int j=0;j<freqOfOTU;j++){
415 if(adF[j] < minFValue){
421 if(centroids[i] != anL[minFIndex]){
423 centroids[i] = anL[minFIndex];
426 else if(centroids[i] != -1){
434 catch(exception& e) {
435 m->errorOut(e, "seqNoise", "calcCentroids");
440 /**************************************************************************************************/
442 int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
444 int numOTUs = centroids.size();
445 vector<int> unique(numOTUs, 1);
447 double minWeight = MIN_WEIGHT;
448 for(int i=0;i<numOTUs;i++){
449 if (m->control_pressed) { return 0; }
450 if(weights[i] < minWeight){ unique[i] = -1; }
453 for(int i=0;i<numOTUs;i++){
454 if (m->control_pressed) { return 0; }
456 for(int j=i+1; j<numOTUs;j++){
458 if(centroids[i] == centroids[j]){
460 weights[i] += weights[j];
470 catch(exception& e) {
471 m->errorOut(e, "seqNoise", "checkCentroids");
476 /**************************************************************************************************/
478 int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
481 int numOTUs = cumCount.size();
482 int numSeqs = otuData.size();
484 vector<double> bestTau(numSeqs, 0);
485 vector<double> bestIndex(numSeqs, -1);
487 for(int i=0;i<numOTUs;i++){
488 if (m->control_pressed) { return 0; }
489 for(int j=0;j<otuFreq[i];j++){
491 int index1 = cumCount[i] + j;
492 double thisTau = tau[anP[index1]];
493 int index2 = anI[index1];
495 if(thisTau > bestTau[index2]){
496 bestTau[index2] = thisTau;
497 bestIndex[index2] = i;
502 for(int i=0;i<numSeqs;i++){
503 if (m->control_pressed) { return 0; }
504 otuData[i] = bestIndex[i];
505 percentage[i] = 1 - bestTau[i];
509 catch(exception& e) {
510 m->errorOut(e, "seqNoise", "setUpOTUData");
515 /**************************************************************************************************/
517 int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
519 int numSeqs = otuData.size();
520 int numOTUs = otuFreq.size();
523 otuFreq.assign(numOTUs, 0);
524 tau.assign(numSeqs, 1);
525 anP.assign(numSeqs, 0);
526 anI.assign(numSeqs, 0);
528 for(int i=0;i<numSeqs;i++){
529 if (m->control_pressed) { return 0; }
530 int otu = otuData[i];
533 otuBySeqLookUp[otu][otuFreq[otu]] = i;
534 aanI[otu][otuFreq[otu]] = i;
537 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
540 catch(exception& e) {
541 m->errorOut(e, "seqNoise", "finishOTUData");
546 /**************************************************************************************************/
548 int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
550 char nullReturn = -1;
553 if (m->control_pressed) { return nullReturn; }
554 if(direction == 'd'){
555 if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
556 else { return nullReturn; }
559 else if(direction == 'l') { j--; }
562 direction = alignMoves[i][j];
567 catch(exception& e) {
568 m->errorOut(e, "seqNoise", "getLastMatch");
573 /**************************************************************************************************/
575 int seqNoise::countDiffs(vector<int> query, vector<int> ref){
577 //double MATCH = 5.0;
578 //double MISMATCH = -2.0;
581 vector<vector<double> > correctMatrix(4);
582 for(int i=0;i<4;i++){ correctMatrix[i].resize(4); }
584 correctMatrix[0][0] = 0.000000; //AA
585 correctMatrix[1][0] = 11.619259; //CA
586 correctMatrix[2][0] = 11.694004; //TA
587 correctMatrix[3][0] = 7.748623; //GA
589 correctMatrix[1][1] = 0.000000; //CC
590 correctMatrix[2][1] = 7.619657; //TC
591 correctMatrix[3][1] = 12.852562; //GC
593 correctMatrix[2][2] = 0.000000; //TT
594 correctMatrix[3][2] = 10.964048; //TG
596 correctMatrix[3][3] = 0.000000; //GG
598 for(int i=0;i<4;i++){
599 for(int j=0;j<i;j++){
600 correctMatrix[j][i] = correctMatrix[i][j];
604 int queryLength = query.size();
605 int refLength = ref.size();
607 vector<vector<double> > alignMatrix(queryLength + 1);
608 vector<vector<char> > alignMoves(queryLength + 1);
610 for(int i=0;i<=queryLength;i++){
611 if (m->control_pressed) { return 0; }
612 alignMatrix[i].resize(refLength + 1, 0);
613 alignMoves[i].resize(refLength + 1, 'x');
616 for(int i=0;i<=queryLength;i++){
617 if (m->control_pressed) { return 0; }
618 alignMatrix[i][0] = 15.0 * i;
619 alignMoves[i][0] = 'u';
622 for(int i=0;i<=refLength;i++){
623 if (m->control_pressed) { return 0; }
624 alignMatrix[0][i] = 15.0 * i;
625 alignMoves[0][i] = 'l';
628 for(int i=1;i<=queryLength;i++){
629 if (m->control_pressed) { return 0; }
630 for(int j=1;j<=refLength;j++){
633 nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
638 if(i == queryLength){ //terminal gap
639 left = alignMatrix[i][j-1];
642 if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
649 left = alignMatrix[i][j-1] + gap;
654 if(j == refLength){ //terminal gap
655 up = alignMatrix[i-1][j];
659 if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
666 up = alignMatrix[i-1][j] + gap;
673 alignMoves[i][j] = 'd';
674 alignMatrix[i][j] = nogap;
677 alignMoves[i][j] = 'u';
678 alignMatrix[i][j] = up;
683 alignMoves[i][j] = 'l';
684 alignMatrix[i][j] = left;
687 alignMoves[i][j] = 'u';
688 alignMatrix[i][j] = up;
698 // string alignA = "";
699 // string alignB = "";
700 // string bases = "ACTG";
702 while(i > 0 && j > 0){
703 if (m->control_pressed) { return 0; }
704 if(alignMoves[i][j] == 'd'){
705 // alignA = bases[query[i-1]] + alignA;
706 // alignB = bases[ref[j-1]] + alignB;
708 if(query[i-1] != ref[j-1]) { diffs++; }
713 else if(alignMoves[i][j] == 'u'){
715 // alignA = bases[query[i-1]] + alignA;
716 // alignB = '-' + alignB;
723 else if(alignMoves[i][j] == 'l'){
724 if(i != queryLength){
725 // alignA = '-' + alignA;
726 // alignB = bases[ref[j-1]] + alignB;
735 // cout << diffs << endl;
736 // cout << alignA << endl;
737 // cout << alignB << endl;
742 catch(exception& e) {
743 m->errorOut(e, "seqNoise", "countDiffs");
749 /**************************************************************************************************/
751 vector<int> seqNoise::convertSeq(string bases){
753 vector<int> numbers(bases.length(), -1);
755 for(int i=0;i<bases.length();i++){
756 if (m->control_pressed) { return numbers; }
760 if(b == 'A') { numbers[i] = 0; }
761 else if(b=='C') { numbers[i] = 1; }
762 else if(b=='T') { numbers[i] = 2; }
763 else if(b=='G') { numbers[i] = 3; }
764 else { numbers[i] = 0; }
769 catch(exception& e) {
770 m->errorOut(e, "seqNoise", "convertSeq");
775 /**************************************************************************************************/
777 string seqNoise::degapSeq(string aligned){
779 string unaligned = "";
781 for(int i=0;i<aligned.length();i++){
783 if (m->control_pressed) { return ""; }
785 if(aligned[i] != '-' && aligned[i] != '.'){
786 unaligned += aligned[i];
792 catch(exception& e) {
793 m->errorOut(e, "seqNoise", "degapSeq");
798 /**************************************************************************************************/
800 int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
802 int numOTUs = finalTau.size();
803 int numSeqs = uniqueNames.size();
805 ofstream fastaFile(fastaFileName.c_str());
806 ofstream namesFile(namesFileName.c_str());
807 ofstream uMapFile(uMapFileName.c_str());
809 vector<int> maxSequenceAbund(numOTUs, 0);
810 vector<int> maxSequenceIndex(numOTUs, 0);
812 for(int i=0;i<numSeqs;i++){
813 if (m->control_pressed) { return 0; }
814 if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
815 maxSequenceAbund[otuData[i]] = seqFreq[i];
816 maxSequenceIndex[otuData[i]] = i;
822 for(int i=0;i<numOTUs;i++){
823 if (m->control_pressed) { return 0; }
827 if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
828 // cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
829 centroids[i] = maxSequenceIndex[i];
832 int index = centroids[i];
834 fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
835 namesFile << uniqueNames[index] << '\t';
837 string refSeq = sequences[index];
838 string redundantSeqs = redundantNames[index];;
841 vector<freqData> frequencyData;
843 for(int j=0;j<numSeqs;j++){
844 if(otuData[j] == i && j != index){
845 frequencyData.push_back(freqData(j, seqFreq[j]));
848 sort(frequencyData.rbegin(), frequencyData.rend());
850 string refDegap = degapSeq(refSeq);
851 vector<int> rUnalign = convertSeq(refDegap);
853 uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
854 uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
857 for(int j=0;j<frequencyData.size();j++){
858 if (m->control_pressed) { return 0; }
859 redundantSeqs += ',' + redundantNames[frequencyData[j].index];
861 uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
863 string querySeq = sequences[frequencyData[j].index];
865 string queryDegap = degapSeq(querySeq);
866 vector<int> qUnalign = convertSeq(queryDegap);
868 int udiffs = countDiffs(qUnalign, rUnalign);
869 uMapFile << udiffs << '\t' << queryDegap << endl;
874 namesFile << redundantSeqs << endl;
884 catch(exception& e) {
885 m->errorOut(e, "seqNoise", "writeOutput");
890 /**************************************************************************************************
892 int main(int argc, char *argv[]){
895 sigma = atof(argv[5]);
897 double cutOff = 0.08;
900 double minDelta = 1e-6;
902 string sequenceFileName = argv[1];
903 string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
905 vector<string> sequences;
906 getSequenceData(sequenceFileName, sequences);
908 int numSeqs = sequences.size();
910 vector<string> uniqueNames(numSeqs);
911 vector<string> redundantNames(numSeqs);
912 vector<int> seqFreq(numSeqs);
914 string namesFileName = argv[4];
915 getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
917 string distFileName = argv[2];
918 vector<double> distances(numSeqs * numSeqs);
919 getDistanceData(distFileName, distances);
921 string listFileName = argv[3];
922 vector<int> otuData(numSeqs);
924 vector<vector<int> > otuBySeqLookUp;
926 getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
928 int numOTUs = otuFreq.size();
930 vector<double> weights(numOTUs, 0);
931 vector<int> change(numOTUs, 1);
932 vector<int> centroids(numOTUs, -1);
933 vector<int> cumCount(numOTUs, 0);
935 vector<double> tau(numSeqs, 1);
936 vector<int> anP(numSeqs, 0);
937 vector<int> anI(numSeqs, 0);
938 vector<int> anN(numSeqs, 0);
939 vector<vector<int> > aanI = otuBySeqLookUp;
942 double maxDelta = 1e6;
944 while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
946 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
947 maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
949 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
950 checkCentroids(weights, centroids);
952 otuFreq.assign(numOTUs, 0);
956 for(int i=0;i<numSeqs;i++){
958 double norm = 0.0000;
959 double minWeight = MIN_WEIGHT;
960 vector<double> currentTau(numOTUs);
962 for(int j=0;j<numOTUs;j++){
963 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
964 offset = distances[i * numSeqs+centroids[j]];
968 for(int j=0;j<numOTUs;j++){
969 if(weights[j] > minWeight){
970 currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
971 norm += currentTau[j];
974 currentTau[j] = 0.0000;
978 for(int j=0;j<numOTUs;j++){
979 currentTau[j] /= norm;
982 for(int j=0;j<numOTUs;j++){
984 if(currentTau[j] > MIN_TAU){
985 int oldTotal = total;
988 tau.resize(oldTotal+1);
989 tau[oldTotal] = currentTau[j];
990 otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
991 aanI[j][otuFreq[j]] = i;
1004 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
1006 vector<double> percentage(numSeqs);
1007 setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
1008 finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
1010 change.assign(numOTUs, 1);
1011 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
1014 vector<int> finalTau(numOTUs, 0);
1015 for(int i=0;i<numSeqs;i++){
1016 finalTau[otuData[i]] += int(seqFreq[i]);
1019 writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
1024 **************************************************************************************************/