--- /dev/null
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+#include "sfasta.h"\r
+#include "seq.h"\r
+\r
+void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)\r
+ {\r
+ const unsigned ROWLEN = 80;\r
+ if (Label != 0)\r
+ fprintf(f, ">%s\n", Label);\r
+ unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+ for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+ {\r
+ unsigned From = BlockIndex*ROWLEN;\r
+ unsigned To = From + ROWLEN;\r
+ if (To >= L)\r
+ To = L;\r
+ for (unsigned Pos = From; Pos < To; ++Pos)\r
+ fputc(Seq[Pos], f);\r
+ fputc('\n', f);\r
+ }\r
+ }\r
+\r
+SeqDB::~SeqDB()\r
+ {\r
+ Clear();\r
+ }\r
+\r
+SeqDB::SeqDB()\r
+ {\r
+ Clear(true);\r
+ }\r
+\r
+void SeqDB::Clear(bool ctor)\r
+ {\r
+ if (!ctor)\r
+ {\r
+ for (unsigned i = 0; i < m_SeqCount; ++i)\r
+ {\r
+ unsigned n = strlen(m_Labels[i]);\r
+ MYFREE(m_Labels[i], n, SeqDB);\r
+ MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);\r
+ }\r
+ MYFREE(m_Labels, m_Size, SeqDB);\r
+ MYFREE(m_Seqs, m_Size, SeqDB);\r
+ MYFREE(m_SeqLengths, m_Size, SeqDB);\r
+ }\r
+\r
+ m_FileName.clear();\r
+ m_SeqCount = 0;\r
+ m_Size = 0;\r
+\r
+ m_Labels = 0;\r
+ m_Seqs = 0;\r
+ m_SeqLengths = 0;\r
+\r
+ m_Aligned = false;\r
+ m_IsNucleo = false;\r
+ m_IsNucleoSet = false;\r
+ }\r
+\r
+void SeqDB::InitEmpty(bool Nucleo)\r
+ {\r
+ Clear();\r
+ m_IsNucleo = Nucleo;\r
+ m_IsNucleoSet = true;\r
+ }\r
+\r
+void SeqDB::FromFasta(const string &FileName, bool AllowGaps)\r
+ {\r
+ Clear();\r
+ m_FileName = FileName;\r
+ SFasta SF;\r
+\r
+ SF.Open(FileName);\r
+ SF.m_AllowGaps = AllowGaps;\r
+\r
+ ProgressStep(0, 1000, "Reading %s", FileName.c_str());\r
+ for (;;)\r
+ {\r
+ unsigned QueryPctDoneX10 = SF.GetPctDoneX10();\r
+ ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());\r
+ const byte *Seq = SF.GetNextSeq();\r
+ if (Seq == 0)\r
+ break;\r
+\r
+ const char *Label = SF.GetLabel();\r
+ unsigned L = SF.GetSeqLength();\r
+ AddSeq(Label, Seq, L);\r
+ }\r
+ ProgressStep(999, 1000, "Reading %s", FileName.c_str());\r
+\r
+ SetIsNucleo();\r
+\r
+ Progress("%s sequences\n", IntToStr(GetSeqCount()));\r
+ }\r
+\r
+void SeqDB::ToFasta(const string &FileName) const\r
+ {\r
+ FILE *f = CreateStdioFile(FileName);\r
+ for (unsigned SeqIndex = 0; SeqIndex < GetSeqCount(); ++SeqIndex)\r
+ ToFasta(f, SeqIndex);\r
+ CloseStdioFile(f);\r
+ }\r
+\r
+void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const\r
+ {\r
+ if (WithLabel)\r
+ fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+\r
+ const unsigned ROWLEN = 80;\r
+\r
+ unsigned L = GetSeqLength(SeqIndex);\r
+ const byte *Seq = GetSeq(SeqIndex);\r
+ unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+ for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+ {\r
+ unsigned From = BlockIndex*ROWLEN;\r
+ unsigned To = From + ROWLEN;\r
+ if (To >= L)\r
+ To = L;\r
+ for (unsigned Pos = From; Pos < To; ++Pos)\r
+ fputc(Seq[Pos], f);\r
+ fputc('\n', f);\r
+ }\r
+ }\r
+\r
+void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const\r
+ {\r
+ asserta(SeqIndex < m_SeqCount);\r
+ fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+ SeqToFasta(f, SeqIndex);\r
+ }\r
+\r
+unsigned SeqDB::GetMaxLabelLength() const\r
+ {\r
+ const unsigned SeqCount = GetSeqCount();\r
+ unsigned MaxL = 0;\r
+ for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+ {\r
+ unsigned L = (unsigned) strlen(m_Labels[Index]);\r
+ if (L > MaxL)\r
+ MaxL = L;\r
+ }\r
+ return MaxL;\r
+ }\r
+\r
+unsigned SeqDB::GetMaxSeqLength() const\r
+ {\r
+ const unsigned SeqCount = GetSeqCount();\r
+ unsigned MaxL = 0;\r
+ for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+ {\r
+ unsigned L = m_SeqLengths[Index];\r
+ if (L > MaxL)\r
+ MaxL = L;\r
+ }\r
+ return MaxL;\r
+ }\r
+\r
+void SeqDB::LogMe() const\r
+ {\r
+ Log("\n");\r
+ const unsigned SeqCount = GetSeqCount();\r
+ Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));\r
+ if (SeqCount == 0)\r
+ return;\r
+\r
+ Log("Index Label Length Seq\n");\r
+ Log("----- ---------------- ------ ---\n");\r
+ for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+ {\r
+ Log("%5u", Index);\r
+ Log(" %16.16s", m_Labels[Index]);\r
+ unsigned L = m_SeqLengths[Index];\r
+ Log(" %6u", L);\r
+ Log(" %*.*s", L, L, m_Seqs[Index]);\r
+ Log("\n");\r
+ }\r
+ }\r
+\r
+void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const\r
+ {\r
+ asserta(Id < m_SeqCount);\r
+ Buffer.Seq = m_Seqs[Id];\r
+ Buffer.Label = m_Labels[Id];\r
+ Buffer.L = m_SeqLengths[Id];\r
+ Buffer.Index = Id;\r
+ Buffer.ORFParent = 0;\r
+ Buffer.RevComp = false;\r
+ Buffer.Nucleo = IsNucleo();\r
+ }\r
+\r
+void SeqDB::SetIsNucleo()\r
+ {\r
+ const unsigned SeqCount = GetSeqCount();\r
+ unsigned N = 0;\r
+ for (unsigned i = 0; i < 100; ++i)\r
+ {\r
+ unsigned SeqIndex = unsigned(rand()%SeqCount);\r
+ const byte *Seq = GetSeq(SeqIndex);\r
+ unsigned L = GetSeqLength(SeqIndex);\r
+ const unsigned Pos = unsigned(rand()%L);\r
+ byte c = Seq[Pos];\r
+\r
+ if (g_IsNucleoChar[c])\r
+ ++N;\r
+ }\r
+ m_IsNucleo = (N > 80);\r
+ m_IsNucleoSet = true;\r
+ }\r
+\r
+unsigned SeqDB::GetTotalLength() const\r
+ {\r
+ const unsigned SeqCount = GetSeqCount();\r
+ unsigned TotalLength = 0;\r
+ for (unsigned Id = 0; Id < SeqCount; ++Id)\r
+ TotalLength += GetSeqLength(Id);\r
+ return TotalLength;\r
+ }\r
+\r
+unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)\r
+ {\r
+ StartTimer(AddSeq);\r
+ if (m_SeqCount >= m_Size)\r
+ {\r
+ unsigned NewSize = unsigned(m_Size*1.5) + 1024;\r
+ char **NewLabels = MYALLOC(char *, NewSize, SeqDB);\r
+ byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);\r
+ unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);\r
+\r
+ for (unsigned i = 0; i < m_SeqCount; ++i)\r
+ {\r
+ NewLabels[i] = m_Labels[i];\r
+ NewSeqs[i] = m_Seqs[i];\r
+ NewSeqLengths[i] = m_SeqLengths[i];\r
+ }\r
+\r
+ MYFREE(m_Labels, m_SeqCount, SeqDB);\r
+ MYFREE(m_Seqs, m_SeqCount, SeqDB);\r
+ MYFREE(m_SeqLengths, m_SeqCount, SeqDB);\r
+\r
+ m_Labels = NewLabels;\r
+ m_Seqs = NewSeqs;\r
+ m_SeqLengths = NewSeqLengths;\r
+ m_Size = NewSize;\r
+ }\r
+\r
+ unsigned Index = m_SeqCount++;\r
+ m_Seqs[Index] = MYALLOC(byte, L, SeqDB);\r
+ memcpy(m_Seqs[Index], Seq, L);\r
+\r
+ unsigned n = strlen(Label) + 1;\r
+ m_Labels[Index] = MYALLOC(char, n, SeqDB);\r
+ memcpy(m_Labels[Index], Label, n);\r
+\r
+ if (Index == 0)\r
+ m_Aligned = true;\r
+ else\r
+ m_Aligned = (m_Aligned && L == m_SeqLengths[0]);\r
+\r
+ m_SeqLengths[Index] = L;\r
+\r
+ EndTimer(AddSeq);\r
+ return Index;\r
+ }\r
+\r
+unsigned SeqDB::GetIndex(const char *Label) const\r
+ {\r
+ for (unsigned i = 0; i < m_SeqCount; ++i)\r
+ if (strcmp(Label, m_Labels[i]) == 0)\r
+ return i;\r
+ Die("SeqDB::GetIndex(%s), not found", Label);\r
+ return UINT_MAX;\r
+ }\r
+\r
+void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)\r
+ {\r
+ LabelToIndex.clear();\r
+ for (unsigned i = 0; i < m_SeqCount; ++i)\r
+ {\r
+ const string &Label = string(GetLabel(i));\r
+ if (LabelToIndex.find(Label) != LabelToIndex.end())\r
+ Die("Duplicate label: %s", Label.c_str());\r
+ LabelToIndex[Label] = i;\r
+ }\r
+ }\r