]> git.donarmstrong.com Git - mothur.git/blobdiff - uchime_src/searchchime.cpp
added uchime_src folder. added biom parameter to make.shared. added biom as a current...
[mothur.git] / uchime_src / searchchime.cpp
diff --git a/uchime_src/searchchime.cpp b/uchime_src/searchchime.cpp
new file mode 100644 (file)
index 0000000..c00a9c4
--- /dev/null
@@ -0,0 +1,304 @@
+#include "myutils.h"\r
+#include "ultra.h"\r
+#include "chime.h"\r
+#include "uc.h"\r
+#include "dp.h"\r
+#include <set>\r
+#include <algorithm>\r
+\r
+#define TRACE  0\r
+\r
+extern FILE *g_fUChime;\r
+\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+  vector<unsigned> &Parents);\r
+\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+  const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
+\r
+static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
+  vector<unsigned> &IdVec, unsigned d)\r
+       {\r
+       IdVec.clear();\r
+       const unsigned ColCount = SIZE(Path);\r
+\r
+       const byte *Q = QSD.Seq;\r
+       const byte *P = PSD.Seq;\r
+\r
+       const unsigned QL = QSD.L;\r
+       const unsigned PL = PSD.L;\r
+\r
+       if (QL <= d)\r
+               {\r
+               IdVec.resize(QSD.L, 0);\r
+               return;\r
+               }\r
+\r
+       unsigned QPos = 0;\r
+       unsigned PPos = 0;\r
+\r
+       vector<bool> SameVec;\r
+       SameVec.reserve(QL);\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char c = Path[Col];\r
+\r
+               bool Same = false;\r
+               if (c == 'M')\r
+                       {\r
+                       byte q = Q[QPos];\r
+                       byte p = P[PPos];\r
+                       Same = (toupper(q) == toupper(p));\r
+                       }\r
+\r
+               if (c == 'M' || c == 'D')\r
+                       {\r
+                       ++QPos;\r
+                       SameVec.push_back(Same);\r
+                       }\r
+\r
+               if (c == 'M' || c == 'I')\r
+                       ++PPos;\r
+               }\r
+\r
+       asserta(SIZE(SameVec) == QL);\r
+\r
+       unsigned n = 0;\r
+       for (unsigned QPos = 0; QPos < d; ++QPos)\r
+               {\r
+               if (SameVec[QPos])\r
+                       ++n;\r
+               IdVec.push_back(n);\r
+               }\r
+\r
+       for (unsigned QPos = d; QPos < QL; ++QPos)\r
+               {\r
+               if (SameVec[QPos])\r
+                       ++n;\r
+               IdVec.push_back(n);\r
+               if (SameVec[QPos-d])\r
+                       --n;\r
+               }\r
+       asserta(SIZE(IdVec) == QL);\r
+\r
+#if    TRACE\r
+       {\r
+       Log("\n");\r
+       Log("GetSmoothedIdVec\n");\r
+       unsigned QPos = 0;\r
+       unsigned PPos = 0;\r
+       Log("Q P  Same       Id\n");\r
+       Log("- -  ----  -------\n");\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char c = Path[Col];\r
+\r
+               bool Same = false;\r
+               if (c == 'M')\r
+                       {\r
+                       byte q = Q[QPos];\r
+                       byte p = P[PPos];\r
+                       Same = (toupper(q) == toupper(p));\r
+                       Log("%c %c  %4c  %7d\n", q, p, tof(Same), IdVec[QPos]);\r
+                       }\r
+\r
+               if (c == 'M' || c == 'D')\r
+                       ++QPos;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PPos;\r
+               }\r
+       }\r
+#endif\r
+       }\r
+\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
+  const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+  float MinFractId, ChimeHit2 &Hit)\r
+       {\r
+       Hit.Clear();\r
+       Hit.QLabel = QSD.Label;\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("\n");\r
+               Log("SearchChime()\n");\r
+               Log("Query>%s\n", QSD.Label);\r
+               }\r
+\r
+       vector<unsigned> Parents;\r
+       GetCandidateParents(U, QSD, QAb, Parents);\r
+\r
+       unsigned ParentCount = SIZE(Parents);\r
+       if (ParentCount <= 1)\r
+               {\r
+               if (opt_verbose)\r
+                       Log("%u candidate parents, done.\n", ParentCount);\r
+               return false;\r
+               }\r
+\r
+       if (opt_fastalign)\r
+               HF.SetA(QSD);\r
+       HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
+\r
+       unsigned ChunkLength;\r
+       vector<unsigned> ChunkLos;\r
+       GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
+       const unsigned ChunkCount = SIZE(ChunkLos);\r
+\r
+       vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
+       vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
+\r
+       vector<SeqData> PSDs;\r
+       vector<string> Paths;\r
+       double TopPctId = 0.0;\r
+       unsigned TopParentIndex = UINT_MAX;\r
+       unsigned QL = QSD.L;\r
+       vector<unsigned> MaxIdVec(QL, 0);\r
+       for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+               {\r
+               unsigned ParentSeqIndex = Parents[ParentIndex];\r
+\r
+               SeqData PSD;\r
+               //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
+               //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
+               //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
+               //PSD.Index = ParentSeqIndex;\r
+               U.GetSeqData(ParentSeqIndex, PSD);\r
+               PSDs.push_back(PSD);\r
+\r
+               if (opt_fastalign)\r
+                       HF.SetB(PSD);\r
+\r
+               PathData PD;\r
+\r
+               float HSPId;\r
+               bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
+               if (!Found)\r
+                       {\r
+                       Paths.push_back("");                            \r
+                       continue;\r
+                       }\r
+\r
+               double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
+               if (opt_selfid && PctId == 100.0)\r
+                       {\r
+                       Paths.push_back("");                            \r
+                       continue;\r
+                       }\r
+\r
+               if (PctId > TopPctId)\r
+                       {\r
+                       TopParentIndex = ParentIndex;\r
+                       TopPctId = PctId;\r
+                       if (TopPctId >= 100.0 - opt_mindiv)\r
+                               {\r
+                               if (opt_verbose)\r
+                                       {\r
+                                       Log("  %.1f%%  >%s\n", TopPctId, PSD.Label);\r
+                                       Log("  Top hit exceeds ctl threshold, done.\n");\r
+                                       return false;\r
+                                       }\r
+                               }\r
+                       }\r
+\r
+               string Path = PD.Start;\r
+               Paths.push_back(Path);\r
+\r
+               vector<unsigned> IdVec;\r
+               GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+               for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                       if (IdVec[QPos] > MaxIdVec[QPos])\r
+                               MaxIdVec[QPos] = IdVec[QPos];\r
+               }\r
+\r
+       vector<unsigned> BestParents;\r
+       for (unsigned k = 0; k < opt_maxp; ++k)\r
+               {\r
+               unsigned BestParent = UINT_MAX;\r
+               unsigned BestCov = 0;\r
+               for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+                       {\r
+                       const SeqData &PSD = PSDs[ParentIndex];\r
+                       const string &Path = Paths[ParentIndex];\r
+                       if (Path == "")\r
+                               continue;\r
+\r
+                       vector<unsigned> IdVec;\r
+                       GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+                       unsigned Cov = 0;\r
+                       for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                               if (IdVec[QPos] == MaxIdVec[QPos])\r
+                                       ++Cov;\r
+\r
+                       if (Cov > BestCov)\r
+                               {\r
+                               BestParent = ParentIndex;\r
+                               BestCov = Cov;\r
+                               }\r
+                       }\r
+\r
+               if (BestParent == UINT_MAX)\r
+                       break;\r
+\r
+               BestParents.push_back(BestParent);\r
+               vector<unsigned> IdVec;\r
+\r
+               const SeqData &PSD = PSDs[BestParent];\r
+               const string &Path = Paths[BestParent];\r
+               GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+               for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                       if (IdVec[QPos] == MaxIdVec[QPos])\r
+                               MaxIdVec[QPos] = UINT_MAX;\r
+               }\r
+\r
+       unsigned BestParentCount = SIZE(BestParents);\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
+               for (unsigned k = 0; k < BestParentCount; ++k)\r
+                       {\r
+                       unsigned i = BestParents[k];\r
+                       Log(" %s\n", PSDs[i].Label);\r
+                       }\r
+               }\r
+\r
+       bool Found = false;\r
+       for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
+               {\r
+               unsigned i1 = BestParents[k1];\r
+               asserta(i1 < ParentCount);\r
+\r
+               const SeqData &PSD1 = PSDs[i1];\r
+               const string &Path1 = Paths[i1];\r
+\r
+               for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
+                       {\r
+                       unsigned i2 = BestParents[k2];\r
+                       asserta(i2 < ParentCount);\r
+                       asserta(i2 != i1);\r
+\r
+                       const SeqData &PSD2 = PSDs[i2];\r
+                       const string &Path2 = Paths[i2];\r
+\r
+                       ChimeHit2 Hit2;\r
+                       AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
+                       Hit2.PctIdQT = TopPctId;\r
+\r
+                       if (Hit2.Accept())\r
+                               Found = true;\r
+\r
+                       if (Hit2.Score > Hit.Score)\r
+                               Hit = Hit2;\r
+\r
+                       if (opt_verbose)\r
+                               Hit2.LogMe();\r
+                       }\r
+               }\r
+\r
+       return Found;\r
+       }\r