--- /dev/null
+#include "myutils.h"\r
+#include "ultra.h"\r
+#include "chime.h"\r
+#include "uc.h"\r
+#include "dp.h"\r
+#include <set>\r
+#include <algorithm>\r
+\r
+#define TRACE 0\r
+\r
+extern FILE *g_fUChime;\r
+\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+ vector<unsigned> &Parents);\r
+\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+ const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
+\r
+static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
+ vector<unsigned> &IdVec, unsigned d)\r
+ {\r
+ IdVec.clear();\r
+ const unsigned ColCount = SIZE(Path);\r
+\r
+ const byte *Q = QSD.Seq;\r
+ const byte *P = PSD.Seq;\r
+\r
+ const unsigned QL = QSD.L;\r
+ const unsigned PL = PSD.L;\r
+\r
+ if (QL <= d)\r
+ {\r
+ IdVec.resize(QSD.L, 0);\r
+ return;\r
+ }\r
+\r
+ unsigned QPos = 0;\r
+ unsigned PPos = 0;\r
+\r
+ vector<bool> SameVec;\r
+ SameVec.reserve(QL);\r
+ for (unsigned Col = 0; Col < ColCount; ++Col)\r
+ {\r
+ char c = Path[Col];\r
+\r
+ bool Same = false;\r
+ if (c == 'M')\r
+ {\r
+ byte q = Q[QPos];\r
+ byte p = P[PPos];\r
+ Same = (toupper(q) == toupper(p));\r
+ }\r
+\r
+ if (c == 'M' || c == 'D')\r
+ {\r
+ ++QPos;\r
+ SameVec.push_back(Same);\r
+ }\r
+\r
+ if (c == 'M' || c == 'I')\r
+ ++PPos;\r
+ }\r
+\r
+ asserta(SIZE(SameVec) == QL);\r
+\r
+ unsigned n = 0;\r
+ for (unsigned QPos = 0; QPos < d; ++QPos)\r
+ {\r
+ if (SameVec[QPos])\r
+ ++n;\r
+ IdVec.push_back(n);\r
+ }\r
+\r
+ for (unsigned QPos = d; QPos < QL; ++QPos)\r
+ {\r
+ if (SameVec[QPos])\r
+ ++n;\r
+ IdVec.push_back(n);\r
+ if (SameVec[QPos-d])\r
+ --n;\r
+ }\r
+ asserta(SIZE(IdVec) == QL);\r
+\r
+#if TRACE\r
+ {\r
+ Log("\n");\r
+ Log("GetSmoothedIdVec\n");\r
+ unsigned QPos = 0;\r
+ unsigned PPos = 0;\r
+ Log("Q P Same Id\n");\r
+ Log("- - ---- -------\n");\r
+ for (unsigned Col = 0; Col < ColCount; ++Col)\r
+ {\r
+ char c = Path[Col];\r
+\r
+ bool Same = false;\r
+ if (c == 'M')\r
+ {\r
+ byte q = Q[QPos];\r
+ byte p = P[PPos];\r
+ Same = (toupper(q) == toupper(p));\r
+ Log("%c %c %4c %7d\n", q, p, tof(Same), IdVec[QPos]);\r
+ }\r
+\r
+ if (c == 'M' || c == 'D')\r
+ ++QPos;\r
+ if (c == 'M' || c == 'I')\r
+ ++PPos;\r
+ }\r
+ }\r
+#endif\r
+ }\r
+\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
+ const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+ float MinFractId, ChimeHit2 &Hit)\r
+ {\r
+ Hit.Clear();\r
+ Hit.QLabel = QSD.Label;\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log("\n");\r
+ Log("SearchChime()\n");\r
+ Log("Query>%s\n", QSD.Label);\r
+ }\r
+\r
+ vector<unsigned> Parents;\r
+ GetCandidateParents(U, QSD, QAb, Parents);\r
+\r
+ unsigned ParentCount = SIZE(Parents);\r
+ if (ParentCount <= 1)\r
+ {\r
+ if (opt_verbose)\r
+ Log("%u candidate parents, done.\n", ParentCount);\r
+ return false;\r
+ }\r
+\r
+ if (opt_fastalign)\r
+ HF.SetA(QSD);\r
+ HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
+\r
+ unsigned ChunkLength;\r
+ vector<unsigned> ChunkLos;\r
+ GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
+ const unsigned ChunkCount = SIZE(ChunkLos);\r
+\r
+ vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
+ vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
+\r
+ vector<SeqData> PSDs;\r
+ vector<string> Paths;\r
+ double TopPctId = 0.0;\r
+ unsigned TopParentIndex = UINT_MAX;\r
+ unsigned QL = QSD.L;\r
+ vector<unsigned> MaxIdVec(QL, 0);\r
+ for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+ {\r
+ unsigned ParentSeqIndex = Parents[ParentIndex];\r
+\r
+ SeqData PSD;\r
+ //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
+ //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
+ //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
+ //PSD.Index = ParentSeqIndex;\r
+ U.GetSeqData(ParentSeqIndex, PSD);\r
+ PSDs.push_back(PSD);\r
+\r
+ if (opt_fastalign)\r
+ HF.SetB(PSD);\r
+\r
+ PathData PD;\r
+\r
+ float HSPId;\r
+ bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
+ if (!Found)\r
+ {\r
+ Paths.push_back(""); \r
+ continue;\r
+ }\r
+\r
+ double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
+ if (opt_selfid && PctId == 100.0)\r
+ {\r
+ Paths.push_back(""); \r
+ continue;\r
+ }\r
+\r
+ if (PctId > TopPctId)\r
+ {\r
+ TopParentIndex = ParentIndex;\r
+ TopPctId = PctId;\r
+ if (TopPctId >= 100.0 - opt_mindiv)\r
+ {\r
+ if (opt_verbose)\r
+ {\r
+ Log(" %.1f%% >%s\n", TopPctId, PSD.Label);\r
+ Log(" Top hit exceeds ctl threshold, done.\n");\r
+ return false;\r
+ }\r
+ }\r
+ }\r
+\r
+ string Path = PD.Start;\r
+ Paths.push_back(Path);\r
+\r
+ vector<unsigned> IdVec;\r
+ GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+ for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+ if (IdVec[QPos] > MaxIdVec[QPos])\r
+ MaxIdVec[QPos] = IdVec[QPos];\r
+ }\r
+\r
+ vector<unsigned> BestParents;\r
+ for (unsigned k = 0; k < opt_maxp; ++k)\r
+ {\r
+ unsigned BestParent = UINT_MAX;\r
+ unsigned BestCov = 0;\r
+ for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+ {\r
+ const SeqData &PSD = PSDs[ParentIndex];\r
+ const string &Path = Paths[ParentIndex];\r
+ if (Path == "")\r
+ continue;\r
+\r
+ vector<unsigned> IdVec;\r
+ GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+ unsigned Cov = 0;\r
+ for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+ if (IdVec[QPos] == MaxIdVec[QPos])\r
+ ++Cov;\r
+\r
+ if (Cov > BestCov)\r
+ {\r
+ BestParent = ParentIndex;\r
+ BestCov = Cov;\r
+ }\r
+ }\r
+\r
+ if (BestParent == UINT_MAX)\r
+ break;\r
+\r
+ BestParents.push_back(BestParent);\r
+ vector<unsigned> IdVec;\r
+\r
+ const SeqData &PSD = PSDs[BestParent];\r
+ const string &Path = Paths[BestParent];\r
+ GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+ for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+ if (IdVec[QPos] == MaxIdVec[QPos])\r
+ MaxIdVec[QPos] = UINT_MAX;\r
+ }\r
+\r
+ unsigned BestParentCount = SIZE(BestParents);\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
+ for (unsigned k = 0; k < BestParentCount; ++k)\r
+ {\r
+ unsigned i = BestParents[k];\r
+ Log(" %s\n", PSDs[i].Label);\r
+ }\r
+ }\r
+\r
+ bool Found = false;\r
+ for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
+ {\r
+ unsigned i1 = BestParents[k1];\r
+ asserta(i1 < ParentCount);\r
+\r
+ const SeqData &PSD1 = PSDs[i1];\r
+ const string &Path1 = Paths[i1];\r
+\r
+ for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
+ {\r
+ unsigned i2 = BestParents[k2];\r
+ asserta(i2 < ParentCount);\r
+ asserta(i2 != i1);\r
+\r
+ const SeqData &PSD2 = PSDs[i2];\r
+ const string &Path2 = Paths[i2];\r
+\r
+ ChimeHit2 Hit2;\r
+ AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
+ Hit2.PctIdQT = TopPctId;\r
+\r
+ if (Hit2.Accept())\r
+ Found = true;\r
+\r
+ if (Hit2.Score > Hit.Score)\r
+ Hit = Hit2;\r
+\r
+ if (opt_verbose)\r
+ Hit2.LogMe();\r
+ }\r
+ }\r
+\r
+ return Found;\r
+ }\r