+++ /dev/null
-#include "myutils.h"\r
-#include "ultra.h"\r
-#include "chime.h"\r
-#include "uc.h"\r
-#include "dp.h"\r
-#include <set>\r
-#include <algorithm>\r
-\r
-#define TRACE 0\r
-\r
-extern FILE *g_fUChime;\r
-\r
-void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
- vector<unsigned> &Parents);\r
-\r
-void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
- const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
-\r
-double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
-\r
-static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
- vector<unsigned> &IdVec, unsigned d)\r
- {\r
- IdVec.clear();\r
- const unsigned ColCount = SIZE(Path);\r
-\r
- const byte *Q = QSD.Seq;\r
- const byte *P = PSD.Seq;\r
-\r
- const unsigned QL = QSD.L;\r
- const unsigned PL = PSD.L;\r
-\r
- if (QL <= d)\r
- {\r
- IdVec.resize(QSD.L, 0);\r
- return;\r
- }\r
-\r
- unsigned QPos = 0;\r
- unsigned PPos = 0;\r
-\r
- vector<bool> SameVec;\r
- SameVec.reserve(QL);\r
- for (unsigned Col = 0; Col < ColCount; ++Col)\r
- {\r
- char c = Path[Col];\r
-\r
- bool Same = false;\r
- if (c == 'M')\r
- {\r
- byte q = Q[QPos];\r
- byte p = P[PPos];\r
- Same = (toupper(q) == toupper(p));\r
- }\r
-\r
- if (c == 'M' || c == 'D')\r
- {\r
- ++QPos;\r
- SameVec.push_back(Same);\r
- }\r
-\r
- if (c == 'M' || c == 'I')\r
- ++PPos;\r
- }\r
-\r
- asserta(SIZE(SameVec) == QL);\r
-\r
- unsigned n = 0;\r
- for (unsigned QPos = 0; QPos < d; ++QPos)\r
- {\r
- if (SameVec[QPos])\r
- ++n;\r
- IdVec.push_back(n);\r
- }\r
-\r
- for (unsigned QPos = d; QPos < QL; ++QPos)\r
- {\r
- if (SameVec[QPos])\r
- ++n;\r
- IdVec.push_back(n);\r
- if (SameVec[QPos-d])\r
- --n;\r
- }\r
- asserta(SIZE(IdVec) == QL);\r
-\r
-#if TRACE\r
- {\r
- Log("\n");\r
- Log("GetSmoothedIdVec\n");\r
- unsigned QPos = 0;\r
- unsigned PPos = 0;\r
- Log("Q P Same Id\n");\r
- Log("- - ---- -------\n");\r
- for (unsigned Col = 0; Col < ColCount; ++Col)\r
- {\r
- char c = Path[Col];\r
-\r
- bool Same = false;\r
- if (c == 'M')\r
- {\r
- byte q = Q[QPos];\r
- byte p = P[PPos];\r
- Same = (toupper(q) == toupper(p));\r
- Log("%c %c %4c %7d\n", q, p, tof(Same), IdVec[QPos]);\r
- }\r
-\r
- if (c == 'M' || c == 'D')\r
- ++QPos;\r
- if (c == 'M' || c == 'I')\r
- ++PPos;\r
- }\r
- }\r
-#endif\r
- }\r
-\r
-bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
- const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
- float MinFractId, ChimeHit2 &Hit)\r
- {\r
- Hit.Clear();\r
- Hit.QLabel = QSD.Label;\r
-\r
- if (opt_verbose)\r
- {\r
- Log("\n");\r
- Log("SearchChime()\n");\r
- Log("Query>%s\n", QSD.Label);\r
- }\r
-\r
- vector<unsigned> Parents;\r
- GetCandidateParents(U, QSD, QAb, Parents);\r
-\r
- unsigned ParentCount = SIZE(Parents);\r
- if (ParentCount <= 1)\r
- {\r
- if (opt_verbose)\r
- Log("%u candidate parents, done.\n", ParentCount);\r
- return false;\r
- }\r
-\r
- if (opt_fastalign)\r
- HF.SetA(QSD);\r
- HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
-\r
- unsigned ChunkLength;\r
- vector<unsigned> ChunkLos;\r
- GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
- const unsigned ChunkCount = SIZE(ChunkLos);\r
-\r
- vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
- vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
-\r
- vector<SeqData> PSDs;\r
- vector<string> Paths;\r
- double TopPctId = 0.0;\r
- unsigned TopParentIndex = UINT_MAX;\r
- unsigned QL = QSD.L;\r
- vector<unsigned> MaxIdVec(QL, 0);\r
- for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
- {\r
- unsigned ParentSeqIndex = Parents[ParentIndex];\r
-\r
- SeqData PSD;\r
- //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
- //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
- //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
- //PSD.Index = ParentSeqIndex;\r
- U.GetSeqData(ParentSeqIndex, PSD);\r
- PSDs.push_back(PSD);\r
-\r
- if (opt_fastalign)\r
- HF.SetB(PSD);\r
-\r
- PathData PD;\r
-\r
- float HSPId;\r
- bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
- if (!Found)\r
- {\r
- Paths.push_back(""); \r
- continue;\r
- }\r
-\r
- double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
- if (opt_selfid && PctId == 100.0)\r
- {\r
- Paths.push_back(""); \r
- continue;\r
- }\r
-\r
- if (PctId > TopPctId)\r
- {\r
- TopParentIndex = ParentIndex;\r
- TopPctId = PctId;\r
- if (TopPctId >= 100.0 - opt_mindiv)\r
- {\r
- if (opt_verbose)\r
- {\r
- Log(" %.1f%% >%s\n", TopPctId, PSD.Label);\r
- Log(" Top hit exceeds ctl threshold, done.\n");\r
- return false;\r
- }\r
- }\r
- }\r
-\r
- string Path = PD.Start;\r
- Paths.push_back(Path);\r
-\r
- vector<unsigned> IdVec;\r
- GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
-\r
- for (unsigned QPos = 0; QPos < QL; ++QPos)\r
- if (IdVec[QPos] > MaxIdVec[QPos])\r
- MaxIdVec[QPos] = IdVec[QPos];\r
- }\r
-\r
- vector<unsigned> BestParents;\r
- for (unsigned k = 0; k < opt_maxp; ++k)\r
- {\r
- unsigned BestParent = UINT_MAX;\r
- unsigned BestCov = 0;\r
- for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
- {\r
- const SeqData &PSD = PSDs[ParentIndex];\r
- const string &Path = Paths[ParentIndex];\r
- if (Path == "")\r
- continue;\r
-\r
- vector<unsigned> IdVec;\r
- GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
-\r
- unsigned Cov = 0;\r
- for (unsigned QPos = 0; QPos < QL; ++QPos)\r
- if (IdVec[QPos] == MaxIdVec[QPos])\r
- ++Cov;\r
-\r
- if (Cov > BestCov)\r
- {\r
- BestParent = ParentIndex;\r
- BestCov = Cov;\r
- }\r
- }\r
-\r
- if (BestParent == UINT_MAX)\r
- break;\r
-\r
- BestParents.push_back(BestParent);\r
- vector<unsigned> IdVec;\r
-\r
- const SeqData &PSD = PSDs[BestParent];\r
- const string &Path = Paths[BestParent];\r
- GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
- for (unsigned QPos = 0; QPos < QL; ++QPos)\r
- if (IdVec[QPos] == MaxIdVec[QPos])\r
- MaxIdVec[QPos] = UINT_MAX;\r
- }\r
-\r
- unsigned BestParentCount = SIZE(BestParents);\r
-\r
- if (opt_verbose)\r
- {\r
- Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
- for (unsigned k = 0; k < BestParentCount; ++k)\r
- {\r
- unsigned i = BestParents[k];\r
- Log(" %s\n", PSDs[i].Label);\r
- }\r
- }\r
-\r
- bool Found = false;\r
- for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
- {\r
- unsigned i1 = BestParents[k1];\r
- asserta(i1 < ParentCount);\r
-\r
- const SeqData &PSD1 = PSDs[i1];\r
- const string &Path1 = Paths[i1];\r
-\r
- for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
- {\r
- unsigned i2 = BestParents[k2];\r
- asserta(i2 < ParentCount);\r
- asserta(i2 != i1);\r
-\r
- const SeqData &PSD2 = PSDs[i2];\r
- const string &Path2 = Paths[i2];\r
-\r
- ChimeHit2 Hit2;\r
- AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
- Hit2.PctIdQT = TopPctId;\r
-\r
- if (Hit2.Accept())\r
- Found = true;\r
-\r
- if (Hit2.Score > Hit.Score)\r
- Hit = Hit2;\r
-\r
- if (opt_verbose)\r
- Hit2.LogMe();\r
- }\r
- }\r
-\r
- return Found;\r
- }\r