]> git.donarmstrong.com Git - mothur.git/blobdiff - uchime_src/seqdb.cpp
added uchime_src folder. added biom parameter to make.shared. added biom as a current...
[mothur.git] / uchime_src / seqdb.cpp
diff --git a/uchime_src/seqdb.cpp b/uchime_src/seqdb.cpp
new file mode 100644 (file)
index 0000000..03de189
--- /dev/null
@@ -0,0 +1,289 @@
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+#include "sfasta.h"\r
+#include "seq.h"\r
+\r
+void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)\r
+       {\r
+       const unsigned ROWLEN = 80;\r
+       if (Label != 0)\r
+               fprintf(f, ">%s\n", Label);\r
+       unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+       for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+               {\r
+               unsigned From = BlockIndex*ROWLEN;\r
+               unsigned To = From + ROWLEN;\r
+               if (To >= L)\r
+                       To = L;\r
+               for (unsigned Pos = From; Pos < To; ++Pos)\r
+                       fputc(Seq[Pos], f);\r
+               fputc('\n', f);\r
+               }\r
+       }\r
+\r
+SeqDB::~SeqDB()\r
+       {\r
+       Clear();\r
+       }\r
+\r
+SeqDB::SeqDB()\r
+       {\r
+       Clear(true);\r
+       }\r
+\r
+void SeqDB::Clear(bool ctor)\r
+       {\r
+       if (!ctor)\r
+               {\r
+               for (unsigned i = 0; i < m_SeqCount; ++i)\r
+                       {\r
+                       unsigned n = strlen(m_Labels[i]);\r
+                       MYFREE(m_Labels[i], n, SeqDB);\r
+                       MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);\r
+                       }\r
+               MYFREE(m_Labels, m_Size, SeqDB);\r
+               MYFREE(m_Seqs, m_Size, SeqDB);\r
+               MYFREE(m_SeqLengths, m_Size, SeqDB);\r
+               }\r
+\r
+       m_FileName.clear();\r
+       m_SeqCount = 0;\r
+       m_Size = 0;\r
+\r
+       m_Labels = 0;\r
+       m_Seqs = 0;\r
+       m_SeqLengths = 0;\r
+\r
+       m_Aligned = false;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       }\r
+\r
+void SeqDB::InitEmpty(bool Nucleo)\r
+       {\r
+       Clear();\r
+       m_IsNucleo = Nucleo;\r
+       m_IsNucleoSet = true;\r
+       }\r
+\r
+void SeqDB::FromFasta(const string &FileName, bool AllowGaps)\r
+       {\r
+       Clear();\r
+       m_FileName = FileName;\r
+       SFasta SF;\r
+\r
+       SF.Open(FileName);\r
+       SF.m_AllowGaps = AllowGaps;\r
+\r
+       ProgressStep(0, 1000, "Reading %s", FileName.c_str());\r
+       for (;;)\r
+               {\r
+               unsigned QueryPctDoneX10 = SF.GetPctDoneX10();\r
+               ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());\r
+               const byte *Seq = SF.GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+\r
+               const char *Label = SF.GetLabel();\r
+               unsigned L = SF.GetSeqLength();\r
+               AddSeq(Label, Seq, L);\r
+               }\r
+       ProgressStep(999, 1000, "Reading %s", FileName.c_str());\r
+\r
+       SetIsNucleo();\r
+\r
+       Progress("%s sequences\n", IntToStr(GetSeqCount()));\r
+       }\r
+\r
+void SeqDB::ToFasta(const string &FileName) const\r
+       {\r
+       FILE *f = CreateStdioFile(FileName);\r
+       for (unsigned SeqIndex = 0; SeqIndex < GetSeqCount(); ++SeqIndex)\r
+               ToFasta(f, SeqIndex);\r
+       CloseStdioFile(f);\r
+       }\r
+\r
+void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const\r
+       {\r
+       if (WithLabel)\r
+               fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+\r
+       const unsigned ROWLEN = 80;\r
+\r
+       unsigned L = GetSeqLength(SeqIndex);\r
+       const byte *Seq = GetSeq(SeqIndex);\r
+       unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+       for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+               {\r
+               unsigned From = BlockIndex*ROWLEN;\r
+               unsigned To = From + ROWLEN;\r
+               if (To >= L)\r
+                       To = L;\r
+               for (unsigned Pos = From; Pos < To; ++Pos)\r
+                       fputc(Seq[Pos], f);\r
+               fputc('\n', f);\r
+               }\r
+       }\r
+\r
+void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const\r
+       {\r
+       asserta(SeqIndex < m_SeqCount);\r
+       fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+       SeqToFasta(f, SeqIndex);\r
+       }\r
+\r
+unsigned SeqDB::GetMaxLabelLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned MaxL = 0;\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               unsigned L = (unsigned) strlen(m_Labels[Index]);\r
+               if (L > MaxL)\r
+                       MaxL = L;\r
+               }\r
+       return MaxL;\r
+       }\r
+\r
+unsigned SeqDB::GetMaxSeqLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned MaxL = 0;\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               unsigned L = m_SeqLengths[Index];\r
+               if (L > MaxL)\r
+                       MaxL = L;\r
+               }\r
+       return MaxL;\r
+       }\r
+\r
+void SeqDB::LogMe() const\r
+       {\r
+       Log("\n");\r
+       const unsigned SeqCount = GetSeqCount();\r
+       Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));\r
+       if (SeqCount == 0)\r
+               return;\r
+\r
+       Log("Index             Label  Length  Seq\n");\r
+       Log("-----  ----------------  ------  ---\n");\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               Log("%5u", Index);\r
+               Log("  %16.16s", m_Labels[Index]);\r
+               unsigned L = m_SeqLengths[Index];\r
+               Log("  %6u", L);\r
+               Log("  %*.*s", L, L, m_Seqs[Index]);\r
+               Log("\n");\r
+               }\r
+       }\r
+\r
+void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const\r
+       {\r
+       asserta(Id < m_SeqCount);\r
+       Buffer.Seq = m_Seqs[Id];\r
+       Buffer.Label = m_Labels[Id];\r
+       Buffer.L = m_SeqLengths[Id];\r
+       Buffer.Index = Id;\r
+       Buffer.ORFParent = 0;\r
+       Buffer.RevComp = false;\r
+       Buffer.Nucleo = IsNucleo();\r
+       }\r
+\r
+void SeqDB::SetIsNucleo()\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned N = 0;\r
+       for (unsigned i = 0; i < 100; ++i)\r
+               {\r
+               unsigned SeqIndex = unsigned(rand()%SeqCount);\r
+               const byte *Seq = GetSeq(SeqIndex);\r
+               unsigned L = GetSeqLength(SeqIndex);\r
+               const unsigned Pos = unsigned(rand()%L);\r
+               byte c = Seq[Pos];\r
+\r
+               if (g_IsNucleoChar[c])\r
+                       ++N;\r
+               }\r
+       m_IsNucleo = (N > 80);\r
+       m_IsNucleoSet = true;\r
+       }\r
+\r
+unsigned SeqDB::GetTotalLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned TotalLength = 0;\r
+       for (unsigned Id = 0; Id < SeqCount; ++Id)\r
+               TotalLength += GetSeqLength(Id);\r
+       return TotalLength;\r
+       }\r
+\r
+unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)\r
+       {\r
+       StartTimer(AddSeq);\r
+       if (m_SeqCount >= m_Size)\r
+               {\r
+               unsigned NewSize = unsigned(m_Size*1.5) + 1024;\r
+               char **NewLabels = MYALLOC(char *, NewSize, SeqDB);\r
+               byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);\r
+               unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);\r
+\r
+               for (unsigned i = 0; i < m_SeqCount; ++i)\r
+                       {\r
+                       NewLabels[i] = m_Labels[i];\r
+                       NewSeqs[i] = m_Seqs[i];\r
+                       NewSeqLengths[i] = m_SeqLengths[i];\r
+                       }\r
+\r
+               MYFREE(m_Labels, m_SeqCount, SeqDB);\r
+               MYFREE(m_Seqs, m_SeqCount, SeqDB);\r
+               MYFREE(m_SeqLengths, m_SeqCount, SeqDB);\r
+\r
+               m_Labels = NewLabels;\r
+               m_Seqs = NewSeqs;\r
+               m_SeqLengths = NewSeqLengths;\r
+               m_Size = NewSize;\r
+               }\r
+\r
+       unsigned Index = m_SeqCount++;\r
+       m_Seqs[Index] = MYALLOC(byte, L, SeqDB);\r
+       memcpy(m_Seqs[Index], Seq, L);\r
+\r
+       unsigned n = strlen(Label) + 1;\r
+       m_Labels[Index] = MYALLOC(char, n, SeqDB);\r
+       memcpy(m_Labels[Index], Label, n);\r
+\r
+       if (Index == 0)\r
+               m_Aligned = true;\r
+       else\r
+               m_Aligned = (m_Aligned && L == m_SeqLengths[0]);\r
+\r
+       m_SeqLengths[Index] = L;\r
+\r
+       EndTimer(AddSeq);\r
+       return Index;\r
+       }\r
+\r
+unsigned SeqDB::GetIndex(const char *Label) const\r
+       {\r
+       for (unsigned i = 0; i < m_SeqCount; ++i)\r
+               if (strcmp(Label, m_Labels[i]) == 0)\r
+                       return i;\r
+       Die("SeqDB::GetIndex(%s), not found", Label);\r
+       return UINT_MAX;\r
+       }\r
+\r
+void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)\r
+       {\r
+       LabelToIndex.clear();\r
+       for (unsigned i = 0; i < m_SeqCount; ++i)\r
+               {\r
+               const string &Label = string(GetLabel(i));\r
+               if (LabelToIndex.find(Label) != LabelToIndex.end())\r
+                       Die("Duplicate label: %s", Label.c_str());\r
+               LabelToIndex[Label] = i;\r
+               }\r
+       }\r