]> git.donarmstrong.com Git - mothur.git/blobdiff - uchime_src/sfasta.cpp
added uchime_src folder. added biom parameter to make.shared. added biom as a current...
[mothur.git] / uchime_src / sfasta.cpp
diff --git a/uchime_src/sfasta.cpp b/uchime_src/sfasta.cpp
new file mode 100644 (file)
index 0000000..918d4f8
--- /dev/null
@@ -0,0 +1,467 @@
+#include "sfasta.h"\r
+#include "orf.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+static inline bool isgap(byte c)\r
+       {\r
+       return c == '-' || c == '.';\r
+       }\r
+\r
+const unsigned BufferSize = 16*1024*1024;\r
+\r
+static unsigned GetMaxPoly(const byte *Seq, unsigned L)\r
+       {\r
+       byte CurrChar = Seq[0];\r
+       unsigned Start = 0;\r
+       unsigned MaxLen = 1;\r
+       for (unsigned i = 1; i < L; ++i)\r
+               {\r
+               char c = Seq[i];\r
+               if (c != CurrChar || i+1 == L)\r
+                       {\r
+                       unsigned Len = i - Start;\r
+                       if (Len > MaxLen)\r
+                               MaxLen = Len;\r
+                       CurrChar = c;\r
+                       Start = i;\r
+                       }\r
+               }\r
+       return MaxLen;\r
+       }\r
+\r
+SFasta::SFasta()\r
+       {\r
+       m_FileName = "";\r
+       m_File = 0;\r
+       m_Buffer = 0;\r
+       m_BufferSize = 0;\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       m_FileSize = 0;\r
+       m_Label = 0;\r
+       m_SeqLength = 0;\r
+       m_TooShortCount = 0;\r
+       m_TooLongCount = 0;\r
+       m_ShortestLength = 0;\r
+       m_LongestLength = 0;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       }\r
+\r
+SFasta::~SFasta()\r
+       {\r
+       Clear();\r
+       }\r
+\r
+void SFasta::Clear()\r
+       {\r
+       MYFREE(m_Buffer, m_BufferSize, SFasta);\r
+       if (m_File != 0)\r
+               CloseStdioFile(m_File);\r
+\r
+       m_FileName = "";\r
+       m_File = 0;\r
+       m_Buffer = 0;\r
+       m_BufferSize = 0;\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       m_FileSize = 0;\r
+       m_Label = 0;\r
+       m_SeqLength = 0;\r
+       m_SeqIndex = UINT_MAX;\r
+       m_AllowGaps = false;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       m_TooShortCount = 0;\r
+       m_TooLongCount = 0;\r
+       m_ShortestLength = 0;\r
+       m_LongestLength = 0;\r
+       m_TooPolyCount = 0;\r
+       }\r
+\r
+void SFasta::LogMe() const\r
+       {\r
+       Log("\n");\r
+       Log("SFasta::LogMe()\n");\r
+       Log("FileName=%s\n", m_FileName.c_str());\r
+       Log("FileSize=%u\n", (unsigned) m_FileSize);\r
+       Log("FilePos=%u\n", (unsigned) m_FilePos);\r
+       Log("BufferSize=%u\n", m_BufferSize);\r
+       Log("BufferPos=%u\n", m_BufferOffset);\r
+       Log("BufferBytes=%u\n", m_BufferBytes);\r
+       if (m_Label == 0)\r
+               Log("Label=NULL\n");\r
+       else\r
+               Log("Label=%s\n", m_Label);\r
+       Log("SeqLength=%u\n", m_SeqLength);\r
+       }\r
+\r
+const byte *SFasta::GetNextSeq()\r
+       {\r
+       for (;;)\r
+               {\r
+               const byte *Seq = GetNextSeqLo();\r
+               if (Seq == 0)\r
+                       {\r
+                       if (m_TooShortCount > 0)\r
+                               Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",\r
+                                 m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());\r
+                       if (m_TooLongCount > 0)\r
+                               Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",\r
+                                 m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());\r
+                       if (m_TooPolyCount > 0)\r
+                               Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",\r
+                                 m_TooPolyCount, opt_maxpoly);\r
+                       return 0;\r
+                       }\r
+               if (m_SeqLength < opt_minlen)\r
+                       {\r
+                       ++m_TooShortCount;\r
+                       if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)\r
+                               m_ShortestLength = m_SeqLength;\r
+                       continue;\r
+                       }\r
+               if (m_SeqLength > opt_maxlen && opt_maxlen != 0)\r
+                       {\r
+                       if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)\r
+                               m_LongestLength = m_SeqLength;\r
+                       ++m_TooLongCount;\r
+                       continue;\r
+                       }\r
+               return Seq;\r
+               }\r
+       }\r
+\r
+const byte *SFasta::GetNextSeqLo()\r
+       {\r
+// End of cache?\r
+       if (m_BufferOffset == m_BufferBytes)\r
+               {\r
+       // End of file?\r
+               if (m_FilePos == m_FileSize)\r
+                       return 0;\r
+               FillCache();\r
+               }\r
+\r
+       StartTimer(SF_GetNextSeq);\r
+       asserta(m_Buffer[m_BufferOffset] == '>');\r
+       m_Label = (char *) (m_Buffer + m_BufferOffset + 1);\r
+       \r
+//// Scan to end-of-line.\r
+//// Use dubious library function strchr() in the hope\r
+//// that it uses fast machine code.\r
+//     byte *ptr = (byte *) strchr(m_Label, '\n');\r
+//     asserta(ptr != 0);\r
+//     *ptr = 0;\r
+\r
+       byte *ptr = 0;\r
+       for (unsigned i = m_BufferOffset; i < m_BufferSize; ++i)\r
+               {\r
+               char c = m_Buffer[i];\r
+               if (c == '\n' || c == '\r')\r
+                       {\r
+                       ptr = m_Buffer + i;\r
+                       break;\r
+                       }\r
+               }\r
+       asserta(ptr != 0);\r
+\r
+       if (opt_trunclabels)\r
+               {\r
+               for (char *p = m_Label; *p; ++p)\r
+                       if (isspace(*p))\r
+                               {\r
+                               *p = 0;\r
+                               break;\r
+                               }\r
+               }\r
+       else\r
+               {\r
+               for (char *p = m_Label; *p; ++p)\r
+                       {\r
+                       if (*p == '\t')\r
+                               *p = ' ';\r
+                       else if (*p == '\r' || *p == '\n')\r
+                               {\r
+                               *p = 0;\r
+                               char NextChar = *(p+1);\r
+                               if (NextChar == '\r' || NextChar == '\n')\r
+                                       ++p;\r
+                               break;\r
+                               }\r
+                       }\r
+               }\r
+\r
+// ptr points to end-of-line.\r
+// Move to start of sequence data.\r
+       byte *Seq = ++ptr;\r
+\r
+// Delete white space in-place\r
+       byte *To = ptr;\r
+       m_BufferOffset = (unsigned) (ptr - m_Buffer);\r
+       while (m_BufferOffset < m_BufferBytes)\r
+               {\r
+               byte c = m_Buffer[m_BufferOffset];\r
+               if (c == '>')\r
+                       {\r
+                       char prevc = '\n';\r
+                       if (m_BufferOffset > 0)\r
+                               prevc = m_Buffer[m_BufferOffset-1];\r
+                       if (prevc == '\n' || prevc == '\r')\r
+                               break;\r
+                       }\r
+               ++m_BufferOffset;\r
+               if (isalpha(c) || (isgap(c) && m_AllowGaps))\r
+                       *To++ = c;\r
+               else if (c == '\n' || c == '\r')\r
+                       continue;\r
+               else\r
+                       {\r
+                       const char *Label = (m_Label == 0 ? "" : m_Label);\r
+                       static bool WarningDone = false;\r
+                       if (!WarningDone)\r
+                               {\r
+                               if (isgap(c))\r
+                                       Warning("Ignoring gaps in FASTA file '%s'",\r
+                                         m_FileName.c_str());\r
+                               else if (isprint(c))\r
+                                       Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",\r
+                                         m_FileName.c_str(), c, Label);\r
+                               else\r
+                                       Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",\r
+                                         m_FileName.c_str(), c, Label);\r
+                               WarningDone = true;\r
+                               }\r
+                       continue;\r
+                       }\r
+               }\r
+       m_SeqLength = unsigned(To - Seq);\r
+\r
+       if (m_SeqIndex == UINT_MAX)\r
+               m_SeqIndex = 0;\r
+       else\r
+               ++m_SeqIndex;\r
+\r
+       EndTimer(SF_GetNextSeq);\r
+       return Seq;\r
+       }\r
+\r
+void SFasta::Open(const string &FileName)\r
+       {\r
+       Clear();\r
+       m_FileName = FileName;\r
+       m_File = OpenStdioFile(FileName);\r
+       m_BufferSize = BufferSize;\r
+       //m_Buffer = myalloc<byte>(m_BufferSize);\r
+       m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);\r
+       m_FileSize = GetStdioFileSize(m_File);\r
+       }\r
+\r
+void SFasta::Rewind()\r
+       {\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       }\r
+\r
+bool SFasta::SetIsNucleo()\r
+       {\r
+       if (m_FilePos != 0)\r
+               Die("SFasta::IsNucleo, not at BOF");\r
+\r
+       unsigned LetterCount = 0;\r
+       unsigned NucleoLetterCount = 0;\r
+       for (;;)\r
+               {\r
+               const byte *Seq = GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+               unsigned L = GetSeqLength();\r
+               for (unsigned i = 0; i < L; ++i)\r
+                       if (g_IsNucleoChar[Seq[i]])\r
+                               ++NucleoLetterCount;\r
+               LetterCount += L;\r
+               if (LetterCount > 256)\r
+                       break;\r
+               }\r
+       Rewind();\r
+       if (LetterCount == 0)\r
+               {\r
+               m_IsNucleoSet = true;\r
+               m_IsNucleo = true;\r
+               return true;\r
+               }\r
+\r
+// Nucleo if more than 90% nucleo letters AGCTUN\r
+       m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9;\r
+       m_IsNucleoSet = true;\r
+       return m_IsNucleo;\r
+       }\r
+\r
+void SFasta::FillCache()\r
+       {\r
+       StartTimer(SF_FillCache);\r
+       asserta(m_FilePos < m_FileSize);\r
+\r
+// off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.\r
+       off_t otBytesToRead = m_FileSize - m_FilePos;\r
+\r
+       bool FinalBuffer = true;\r
+       if (otBytesToRead > (off_t) m_BufferSize)\r
+               {\r
+               FinalBuffer = false;\r
+               otBytesToRead = m_BufferSize;\r
+               }\r
+\r
+       unsigned BytesToRead = unsigned(otBytesToRead);\r
+       asserta(BytesToRead > 0);\r
+       asserta(BytesToRead <= m_BufferSize);\r
+\r
+       SetStdioFilePos(m_File, m_FilePos);\r
+       ReadStdioFile(m_File, m_Buffer, BytesToRead);\r
+       if (m_Buffer[0] != '>')\r
+               {\r
+               if (m_FilePos == 0)\r
+                       Die("Input is not FASTA file");\r
+               else\r
+                       Die("SFasta::FillCache() failed, expected '>'");\r
+               }\r
+\r
+       m_BufferOffset = 0;\r
+\r
+// If last buffer in file, done\r
+       if (FinalBuffer)\r
+               {\r
+               m_BufferBytes = BytesToRead;\r
+               m_FilePos += BytesToRead;\r
+               EndTimer(SF_FillCache);\r
+               return;\r
+               }\r
+\r
+// If not last buffer, truncate any partial sequence\r
+// at end of buffer. Search backwards to find last '>'.\r
+       byte *ptr = m_Buffer + BytesToRead - 1;\r
+       while (ptr > m_Buffer)\r
+               {\r
+               if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))\r
+                       break;\r
+               --ptr;\r
+               }\r
+\r
+       if (ptr == m_Buffer)\r
+               {\r
+               LogMe();\r
+               if (*ptr != '>')\r
+                       {\r
+       // No '>' found.\r
+       // This might techincally be legal FASTA if the entire\r
+       // buffer is white space, but strange if not the last buffer\r
+       // in the file, so quit anyway.\r
+                       Die("Failed to find '>' (pos=%u, bytes=%u)",\r
+                         (unsigned) m_FilePos, BytesToRead);\r
+                       }\r
+               else\r
+                       {\r
+       // Entire buffer is one sequence which may be truncated.\r
+                       Die("Sequence too long (pos=%u, bytes=%u)",\r
+                         (unsigned) m_FilePos, BytesToRead);\r
+                       }\r
+               }\r
+\r
+       asserta(*ptr == '>');\r
+\r
+       m_BufferBytes = unsigned(ptr - m_Buffer);\r
+       m_FilePos += m_BufferBytes;\r
+\r
+       EndTimer(SF_FillCache);\r
+       }\r
+\r
+unsigned SFasta::GetPctDoneX10() const\r
+       {\r
+       if (m_FilePos == 0 || m_FileSize == 0)\r
+               return 0;\r
+\r
+       assert(m_FilePos >= (off_t) m_BufferBytes);\r
+       off_t BufferStart = m_FilePos - m_BufferBytes;\r
+       off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+       unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize));\r
+       if (iPctX10 == 0)\r
+               return 1;\r
+       if (iPctX10 >= 999)\r
+               return 998;\r
+       return iPctX10;\r
+       }\r
+\r
+double SFasta::GetPctDone() const\r
+       {\r
+       if (m_FilePos == 0 || m_FileSize == 0)\r
+               return 0;\r
+\r
+       assert(m_FilePos >= (off_t) m_BufferBytes);\r
+       off_t BufferStart = m_FilePos - m_BufferBytes;\r
+       off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+       return double(BufferPos)*100.0/double(m_FileSize);\r
+       }\r
+\r
+bool SFasta::GetNextSD(SeqData &SD)\r
+       {\r
+       SD.Seq = GetNextSeq();\r
+       if (SD.Seq == 0)\r
+               return false;\r
+\r
+       SD.Label = GetLabel();\r
+       SD.L = GetSeqLength();\r
+       SD.Index = GetSeqIndex();\r
+       SD.ORFParent = 0;\r
+       SD.Nucleo = GetIsNucleo();\r
+       SD.RevComp = false;\r
+\r
+       return true;\r
+       }\r
+\r
+#if    TEST\r
+void TestSFasta()\r
+       {\r
+       SFasta SF;\r
+       SF.Open(opt_input);\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("  Index   Length  Label\n");\r
+               Log("-------  -------  -----\n");\r
+               }\r
+\r
+       unsigned Index = 0;\r
+       unsigned SeqCount = 0;\r
+       double LetterCount = 0.0;\r
+       ProgressStep(0, 1000, "Reading");\r
+       for (;;)\r
+               {\r
+               const byte *Seq = SF.GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+               ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");\r
+               const char *Label = SF.GetLabel();\r
+               unsigned L = SF.GetSeqLength();\r
+               ++SeqCount;\r
+               LetterCount += L;\r
+\r
+               if (opt_verbose)\r
+                       {\r
+                       Log(">%7u  %7u  '%s'\n", Index, L, Label);\r
+                       Log("+%7.7s  %7.7s  \"%*.*s\"\n", "", "", L, L, Seq);\r
+                       }\r
+\r
+               ++Index;\r
+               }\r
+       ProgressStep(999, 1000, "Reading");\r
+\r
+       Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+       Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+       }\r
+#endif // TEST\r