5 * Created by Pat Schloss on 8/31/11.
6 * Copyright 2011 Patrick D. Schloss. All rights reserved.
11 #include "sequence.hpp"
13 #define MIN_DELTA 1.0e-6
17 #define MIN_TAU 1.0e-4
18 #define MIN_WEIGHT 0.1
21 /**************************************************************************************************/
22 int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
25 ifstream sequenceFile;
26 m->openInputFile(sequenceFileName, sequenceFile);
28 while(!sequenceFile.eof()){
30 if (m->control_pressed) { break; }
32 Sequence temp(sequenceFile); m->gobble(sequenceFile);
34 if (temp.getName() != "") {
35 sequences.push_back(temp.getAligned());
43 m->errorOut(e, "seqNoise", "getSequenceData");
47 /**************************************************************************************************/
48 int seqNoise::addSeq(string seq, vector<string>& sequences){
50 sequences.push_back(seq);
54 m->errorOut(e, "seqNoise", "addSeq");
58 /**************************************************************************************************/
59 //no checks for file mismatches
60 int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
62 string unique, redundant;
64 m->openInputFile(namesFileName, namesFile);
66 for(int i=0;i<redundantNames.size();i++){
68 if (m->control_pressed) { break; }
70 namesFile >> uniqueNames[i]; m->gobble(namesFile);
71 namesFile >> redundantNames[i]; m->gobble(namesFile);
73 seqFreq[i] = m->getNumNames(redundantNames[i]);
80 m->errorOut(e, "seqNoise", "getRedundantNames");
84 /**************************************************************************************************/
85 int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
88 uniqueNames.push_back(uniqueName);
89 redundantNames.push_back(redundantName);
90 seqFreq.push_back(m->getNumNames(redundantName));
95 m->errorOut(e, "seqNoise", "addRedundantName");
99 /**************************************************************************************************/
100 int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
104 m->openInputFile(distFileName, distFile);
111 for(int i=0;i<numSeqs;i++){
113 if (m->control_pressed) { break; }
115 distances[i * numSeqs + i] = 0.0000;
119 for(int j=0;j<i;j++){
120 distFile >> distances[i * numSeqs + j];
121 distances[j * numSeqs + i] = distances[i * numSeqs + j];
129 catch(exception& e) {
130 m->errorOut(e, "seqNoise", "getDistanceData");
135 /**************************************************************************************************/
136 int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
140 m->openInputFile(listFileName, listFile);
144 bool adjustCutoff = true;
146 if(listFile.peek() == 'u'){ m->getline(listFile); }
149 listFile >> threshold;
151 if(threshold < cutOff){
152 line = m->getline(listFile); m->gobble(listFile);
155 adjustCutoff = false;
157 otuFreq.resize(numOTUs, 0);
159 for(int i=0;i<numOTUs;i++){
161 if (m->control_pressed) { return 0; }
170 for(int j=0;j<otu.size();j++){
175 int index = atoi(number.c_str());
182 int index = atoi(number.c_str());
189 otuBySeqLookUp.resize(numOTUs);
191 int numSeqs = otuData.size();
193 for(int i=0;i<numSeqs;i++){
194 if (m->control_pressed) { return 0; }
195 otuBySeqLookUp[otuData[i]].push_back(i);
197 for(int i=0;i<numOTUs;i++){
198 if (m->control_pressed) { return 0; }
199 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
200 otuBySeqLookUp[i].push_back(0);
210 //the listfile does not contain a threshold greater than the cutoff so use highest value
212 istringstream iss (line,istringstream::in);
215 otuFreq.resize(numOTUs, 0);
217 for(int i=0;i<numOTUs;i++){
219 if (m->control_pressed) { return 0; }
228 for(int j=0;j<otu.size();j++){
233 int index = atoi(number.c_str());
240 int index = atoi(number.c_str());
247 otuBySeqLookUp.resize(numOTUs);
249 int numSeqs = otuData.size();
251 for(int i=0;i<numSeqs;i++){
252 if (m->control_pressed) { return 0; }
253 otuBySeqLookUp[otuData[i]].push_back(i);
255 for(int i=0;i<numOTUs;i++){
256 if (m->control_pressed) { return 0; }
257 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
258 otuBySeqLookUp[i].push_back(0);
266 catch(exception& e) {
267 m->errorOut(e, "seqNoise", "getListData");
272 /**************************************************************************************************/
273 int seqNoise::updateOTUCountData(vector<int> otuFreq,
274 vector<vector<int> > otuBySeqLookUp,
275 vector<vector<int> > aanI,
278 vector<int>& cumCount
281 int numOTUs = otuFreq.size();
285 for(int i=0;i<numOTUs;i++){
288 if (m->control_pressed) { return 0; }
290 for(int j=0;j<otuFreq[i];j++){
291 anP[count] = otuBySeqLookUp[i][j];
292 anI[count] = aanI[i][j];
300 catch(exception& e) {
301 m->errorOut(e, "seqNoise", "updateOTUCountData");
305 /**************************************************************************************************/
306 double seqNoise::calcNewWeights(
307 vector<double>& weights, //
308 vector<int> seqFreq, //
310 vector<int> cumCount, //
312 vector<int> otuFreq, //
313 vector<double> tau //
317 int numOTUs = weights.size();
318 double maxChange = -1;
322 for(int i=0;i<numOTUs;i++){
324 if (m->control_pressed) { return 0; }
326 double change = weights[i];
330 for(int j=0;j<otuFreq[i];j++){
332 int index1 = cumCount[i] + j;
333 int index2 = anI[index1];
335 double currentTau = tau[anP[index1]];
336 double freq = double(seqFreq[index2]);
338 weights[i] += currentTau * freq;
340 change = fabs(weights[i] - change);
342 if(change > maxChange){ maxChange = change; }
348 catch(exception& e) {
349 m->errorOut(e, "seqNoise", "calcNewWeights");
354 /**************************************************************************************************/
356 int seqNoise::calcCentroids(
360 vector<int>& centroids,
361 vector<int> cumCount,
362 vector<double> distances,///
368 int numOTUs = change.size();
369 int numSeqs = seqFreq.size();
371 for(int i=0;i<numOTUs;i++){
373 if (m->control_pressed) { return 0; }
376 double minFValue = 1e10;
379 double count = 0.00000;
381 int freqOfOTU = otuFreq[i];
383 for(int j=0;j<freqOfOTU;j++){
384 int index = cumCount[i] + j;
385 count += seqFreq[anI[index]]*tau[anP[index]];
388 if(freqOfOTU > 0 && count > MIN_COUNT){
390 vector<double> adF(freqOfOTU);
391 vector<int> anL(freqOfOTU);
393 for(int j=0;j<freqOfOTU;j++){
394 anL[j] = anI[cumCount[i] + j];
398 for(int j=0;j<freqOfOTU;j++){
399 int index = cumCount[i] + j;
400 double curTau = tau[anP[index]];
402 for(int k=0;k<freqOfOTU;k++){
403 double dist = distances[anL[j]*numSeqs + anL[k]];
405 adF[k] += dist * curTau * seqFreq[anL[j]];
409 for(int j=0;j<freqOfOTU;j++){
410 if(adF[j] < minFValue){
416 if(centroids[i] != anL[minFIndex]){
418 centroids[i] = anL[minFIndex];
421 else if(centroids[i] != -1){
429 catch(exception& e) {
430 m->errorOut(e, "seqNoise", "calcCentroids");
435 /**************************************************************************************************/
437 int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
439 int numOTUs = centroids.size();
440 vector<int> unique(numOTUs, 1);
442 double minWeight = MIN_WEIGHT;
443 for(int i=0;i<numOTUs;i++){
444 if (m->control_pressed) { return 0; }
445 if(weights[i] < minWeight){ unique[i] = -1; }
448 for(int i=0;i<numOTUs;i++){
449 if (m->control_pressed) { return 0; }
451 for(int j=i+1; j<numOTUs;j++){
453 if(centroids[i] == centroids[j]){
455 weights[i] += weights[j];
465 catch(exception& e) {
466 m->errorOut(e, "seqNoise", "checkCentroids");
471 /**************************************************************************************************/
473 int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
476 int numOTUs = cumCount.size();
477 int numSeqs = otuData.size();
479 vector<double> bestTau(numSeqs, 0);
480 vector<double> bestIndex(numSeqs, -1);
482 for(int i=0;i<numOTUs;i++){
483 if (m->control_pressed) { return 0; }
484 for(int j=0;j<otuFreq[i];j++){
486 int index1 = cumCount[i] + j;
487 double thisTau = tau[anP[index1]];
488 int index2 = anI[index1];
490 if(thisTau > bestTau[index2]){
491 bestTau[index2] = thisTau;
492 bestIndex[index2] = i;
497 for(int i=0;i<numSeqs;i++){
498 if (m->control_pressed) { return 0; }
499 otuData[i] = bestIndex[i];
500 percentage[i] = 1 - bestTau[i];
504 catch(exception& e) {
505 m->errorOut(e, "seqNoise", "setUpOTUData");
510 /**************************************************************************************************/
512 int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
514 int numSeqs = otuData.size();
515 int numOTUs = otuFreq.size();
518 otuFreq.assign(numOTUs, 0);
519 tau.assign(numSeqs, 1);
520 anP.assign(numSeqs, 0);
521 anI.assign(numSeqs, 0);
523 for(int i=0;i<numSeqs;i++){
524 if (m->control_pressed) { return 0; }
525 int otu = otuData[i];
528 otuBySeqLookUp[otu][otuFreq[otu]] = i;
529 aanI[otu][otuFreq[otu]] = i;
532 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
535 catch(exception& e) {
536 m->errorOut(e, "seqNoise", "finishOTUData");
541 /**************************************************************************************************/
543 int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
545 char nullReturn = -1;
548 if (m->control_pressed) { return nullReturn; }
549 if(direction == 'd'){
550 if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
551 else { return nullReturn; }
554 else if(direction == 'l') { j--; }
557 direction = alignMoves[i][j];
562 catch(exception& e) {
563 m->errorOut(e, "seqNoise", "getLastMatch");
568 /**************************************************************************************************/
570 int seqNoise::countDiffs(vector<int> query, vector<int> ref){
572 //double MATCH = 5.0;
573 //double MISMATCH = -2.0;
576 vector<vector<double> > correctMatrix(4);
577 for(int i=0;i<4;i++){ correctMatrix[i].resize(4); }
579 correctMatrix[0][0] = 0.000000; //AA
580 correctMatrix[1][0] = 11.619259; //CA
581 correctMatrix[2][0] = 11.694004; //TA
582 correctMatrix[3][0] = 7.748623; //GA
584 correctMatrix[1][1] = 0.000000; //CC
585 correctMatrix[2][1] = 7.619657; //TC
586 correctMatrix[3][1] = 12.852562; //GC
588 correctMatrix[2][2] = 0.000000; //TT
589 correctMatrix[3][2] = 10.964048; //TG
591 correctMatrix[3][3] = 0.000000; //GG
593 for(int i=0;i<4;i++){
594 for(int j=0;j<i;j++){
595 correctMatrix[j][i] = correctMatrix[i][j];
599 int queryLength = query.size();
600 int refLength = ref.size();
602 vector<vector<double> > alignMatrix(queryLength + 1);
603 vector<vector<char> > alignMoves(queryLength + 1);
605 for(int i=0;i<=queryLength;i++){
606 if (m->control_pressed) { return 0; }
607 alignMatrix[i].resize(refLength + 1, 0);
608 alignMoves[i].resize(refLength + 1, 'x');
611 for(int i=0;i<=queryLength;i++){
612 if (m->control_pressed) { return 0; }
613 alignMatrix[i][0] = 15.0 * i;
614 alignMoves[i][0] = 'u';
617 for(int i=0;i<=refLength;i++){
618 if (m->control_pressed) { return 0; }
619 alignMatrix[0][i] = 15.0 * i;
620 alignMoves[0][i] = 'l';
623 for(int i=1;i<=queryLength;i++){
624 if (m->control_pressed) { return 0; }
625 for(int j=1;j<=refLength;j++){
628 nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
633 if(i == queryLength){ //terminal gap
634 left = alignMatrix[i][j-1];
637 if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
644 left = alignMatrix[i][j-1] + gap;
649 if(j == refLength){ //terminal gap
650 up = alignMatrix[i-1][j];
654 if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
661 up = alignMatrix[i-1][j] + gap;
668 alignMoves[i][j] = 'd';
669 alignMatrix[i][j] = nogap;
672 alignMoves[i][j] = 'u';
673 alignMatrix[i][j] = up;
678 alignMoves[i][j] = 'l';
679 alignMatrix[i][j] = left;
682 alignMoves[i][j] = 'u';
683 alignMatrix[i][j] = up;
693 // string alignA = "";
694 // string alignB = "";
695 // string bases = "ACTG";
697 while(i > 0 && j > 0){
698 if (m->control_pressed) { return 0; }
699 if(alignMoves[i][j] == 'd'){
700 // alignA = bases[query[i-1]] + alignA;
701 // alignB = bases[ref[j-1]] + alignB;
703 if(query[i-1] != ref[j-1]) { diffs++; }
708 else if(alignMoves[i][j] == 'u'){
710 // alignA = bases[query[i-1]] + alignA;
711 // alignB = '-' + alignB;
718 else if(alignMoves[i][j] == 'l'){
719 if(i != queryLength){
720 // alignA = '-' + alignA;
721 // alignB = bases[ref[j-1]] + alignB;
730 // cout << diffs << endl;
731 // cout << alignA << endl;
732 // cout << alignB << endl;
737 catch(exception& e) {
738 m->errorOut(e, "seqNoise", "countDiffs");
744 /**************************************************************************************************/
746 vector<int> seqNoise::convertSeq(string bases){
748 vector<int> numbers(bases.length(), -1);
750 for(int i=0;i<bases.length();i++){
751 if (m->control_pressed) { return numbers; }
755 if(b == 'A') { numbers[i] = 0; }
756 else if(b=='C') { numbers[i] = 1; }
757 else if(b=='T') { numbers[i] = 2; }
758 else if(b=='G') { numbers[i] = 3; }
759 else { numbers[i] = 0; }
764 catch(exception& e) {
765 m->errorOut(e, "seqNoise", "convertSeq");
770 /**************************************************************************************************/
772 string seqNoise::degapSeq(string aligned){
774 string unaligned = "";
776 for(int i=0;i<aligned.length();i++){
778 if (m->control_pressed) { return ""; }
780 if(aligned[i] != '-' && aligned[i] != '.'){
781 unaligned += aligned[i];
787 catch(exception& e) {
788 m->errorOut(e, "seqNoise", "degapSeq");
793 /**************************************************************************************************/
795 int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
797 int numOTUs = finalTau.size();
798 int numSeqs = uniqueNames.size();
800 ofstream fastaFile(fastaFileName.c_str());
801 ofstream namesFile(namesFileName.c_str());
802 ofstream uMapFile(uMapFileName.c_str());
804 vector<int> maxSequenceAbund(numOTUs, 0);
805 vector<int> maxSequenceIndex(numOTUs, 0);
807 for(int i=0;i<numSeqs;i++){
808 if (m->control_pressed) { return 0; }
809 if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
810 maxSequenceAbund[otuData[i]] = seqFreq[i];
811 maxSequenceIndex[otuData[i]] = i;
817 for(int i=0;i<numOTUs;i++){
818 if (m->control_pressed) { return 0; }
822 if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
823 // cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
824 centroids[i] = maxSequenceIndex[i];
827 int index = centroids[i];
829 fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
830 namesFile << uniqueNames[index] << '\t';
832 string refSeq = sequences[index];
833 string redundantSeqs = redundantNames[index];;
836 vector<freqData> frequencyData;
838 for(int j=0;j<numSeqs;j++){
839 if(otuData[j] == i && j != index){
840 frequencyData.push_back(freqData(j, seqFreq[j]));
843 sort(frequencyData.rbegin(), frequencyData.rend());
845 string refDegap = degapSeq(refSeq);
846 vector<int> rUnalign = convertSeq(refDegap);
848 uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
849 uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
852 for(int j=0;j<frequencyData.size();j++){
853 if (m->control_pressed) { return 0; }
854 redundantSeqs += ',' + redundantNames[frequencyData[j].index];
856 uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
858 string querySeq = sequences[frequencyData[j].index];
860 string queryDegap = degapSeq(querySeq);
861 vector<int> qUnalign = convertSeq(queryDegap);
863 int udiffs = countDiffs(qUnalign, rUnalign);
864 uMapFile << udiffs << '\t' << queryDegap << endl;
869 namesFile << redundantSeqs << endl;
879 catch(exception& e) {
880 m->errorOut(e, "seqNoise", "writeOutput");
885 /**************************************************************************************************
887 int main(int argc, char *argv[]){
890 sigma = atof(argv[5]);
892 double cutOff = 0.08;
895 double minDelta = 1e-6;
897 string sequenceFileName = argv[1];
898 string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
900 vector<string> sequences;
901 getSequenceData(sequenceFileName, sequences);
903 int numSeqs = sequences.size();
905 vector<string> uniqueNames(numSeqs);
906 vector<string> redundantNames(numSeqs);
907 vector<int> seqFreq(numSeqs);
909 string namesFileName = argv[4];
910 getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
912 string distFileName = argv[2];
913 vector<double> distances(numSeqs * numSeqs);
914 getDistanceData(distFileName, distances);
916 string listFileName = argv[3];
917 vector<int> otuData(numSeqs);
919 vector<vector<int> > otuBySeqLookUp;
921 getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
923 int numOTUs = otuFreq.size();
925 vector<double> weights(numOTUs, 0);
926 vector<int> change(numOTUs, 1);
927 vector<int> centroids(numOTUs, -1);
928 vector<int> cumCount(numOTUs, 0);
930 vector<double> tau(numSeqs, 1);
931 vector<int> anP(numSeqs, 0);
932 vector<int> anI(numSeqs, 0);
933 vector<int> anN(numSeqs, 0);
934 vector<vector<int> > aanI = otuBySeqLookUp;
937 double maxDelta = 1e6;
939 while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
941 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
942 maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
944 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
945 checkCentroids(weights, centroids);
947 otuFreq.assign(numOTUs, 0);
951 for(int i=0;i<numSeqs;i++){
953 double norm = 0.0000;
954 double minWeight = MIN_WEIGHT;
955 vector<double> currentTau(numOTUs);
957 for(int j=0;j<numOTUs;j++){
958 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
959 offset = distances[i * numSeqs+centroids[j]];
963 for(int j=0;j<numOTUs;j++){
964 if(weights[j] > minWeight){
965 currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
966 norm += currentTau[j];
969 currentTau[j] = 0.0000;
973 for(int j=0;j<numOTUs;j++){
974 currentTau[j] /= norm;
977 for(int j=0;j<numOTUs;j++){
979 if(currentTau[j] > MIN_TAU){
980 int oldTotal = total;
983 tau.resize(oldTotal+1);
984 tau[oldTotal] = currentTau[j];
985 otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
986 aanI[j][otuFreq[j]] = i;
999 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
1001 vector<double> percentage(numSeqs);
1002 setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
1003 finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
1005 change.assign(numOTUs, 1);
1006 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
1009 vector<int> finalTau(numOTUs, 0);
1010 for(int i=0;i<numSeqs;i++){
1011 finalTau[otuData[i]] += int(seqFreq[i]);
1014 writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
1019 **************************************************************************************************/