--- /dev/null
+#include "sfasta.h"\r
+#include "orf.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+static inline bool isgap(byte c)\r
+ {\r
+ return c == '-' || c == '.';\r
+ }\r
+\r
+const unsigned BufferSize = 16*1024*1024;\r
+\r
+static unsigned GetMaxPoly(const byte *Seq, unsigned L)\r
+ {\r
+ byte CurrChar = Seq[0];\r
+ unsigned Start = 0;\r
+ unsigned MaxLen = 1;\r
+ for (unsigned i = 1; i < L; ++i)\r
+ {\r
+ char c = Seq[i];\r
+ if (c != CurrChar || i+1 == L)\r
+ {\r
+ unsigned Len = i - Start;\r
+ if (Len > MaxLen)\r
+ MaxLen = Len;\r
+ CurrChar = c;\r
+ Start = i;\r
+ }\r
+ }\r
+ return MaxLen;\r
+ }\r
+\r
+SFasta::SFasta()\r
+ {\r
+ m_FileName = "";\r
+ m_File = 0;\r
+ m_Buffer = 0;\r
+ m_BufferSize = 0;\r
+ m_BufferOffset = 0;\r
+ m_BufferBytes = 0;\r
+ m_FilePos = 0;\r
+ m_FileSize = 0;\r
+ m_Label = 0;\r
+ m_SeqLength = 0;\r
+ m_TooShortCount = 0;\r
+ m_TooLongCount = 0;\r
+ m_ShortestLength = 0;\r
+ m_LongestLength = 0;\r
+ m_IsNucleo = false;\r
+ m_IsNucleoSet = false;\r
+ }\r
+\r
+SFasta::~SFasta()\r
+ {\r
+ Clear();\r
+ }\r
+\r
+void SFasta::Clear()\r
+ {\r
+ MYFREE(m_Buffer, m_BufferSize, SFasta);\r
+ if (m_File != 0)\r
+ CloseStdioFile(m_File);\r
+\r
+ m_FileName = "";\r
+ m_File = 0;\r
+ m_Buffer = 0;\r
+ m_BufferSize = 0;\r
+ m_BufferOffset = 0;\r
+ m_BufferBytes = 0;\r
+ m_FilePos = 0;\r
+ m_FileSize = 0;\r
+ m_Label = 0;\r
+ m_SeqLength = 0;\r
+ m_SeqIndex = UINT_MAX;\r
+ m_AllowGaps = false;\r
+ m_IsNucleo = false;\r
+ m_IsNucleoSet = false;\r
+ m_TooShortCount = 0;\r
+ m_TooLongCount = 0;\r
+ m_ShortestLength = 0;\r
+ m_LongestLength = 0;\r
+ m_TooPolyCount = 0;\r
+ }\r
+\r
+void SFasta::LogMe() const\r
+ {\r
+ Log("\n");\r
+ Log("SFasta::LogMe()\n");\r
+ Log("FileName=%s\n", m_FileName.c_str());\r
+ Log("FileSize=%u\n", (unsigned) m_FileSize);\r
+ Log("FilePos=%u\n", (unsigned) m_FilePos);\r
+ Log("BufferSize=%u\n", m_BufferSize);\r
+ Log("BufferPos=%u\n", m_BufferOffset);\r
+ Log("BufferBytes=%u\n", m_BufferBytes);\r
+ if (m_Label == 0)\r
+ Log("Label=NULL\n");\r
+ else\r
+ Log("Label=%s\n", m_Label);\r
+ Log("SeqLength=%u\n", m_SeqLength);\r
+ }\r
+\r
+const byte *SFasta::GetNextSeq()\r
+ {\r
+ for (;;)\r
+ {\r
+ const byte *Seq = GetNextSeqLo();\r
+ if (Seq == 0)\r
+ {\r
+ if (m_TooShortCount > 0)\r
+ Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",\r
+ m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());\r
+ if (m_TooLongCount > 0)\r
+ Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",\r
+ m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());\r
+ if (m_TooPolyCount > 0)\r
+ Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",\r
+ m_TooPolyCount, opt_maxpoly);\r
+ return 0;\r
+ }\r
+ if (m_SeqLength < opt_minlen)\r
+ {\r
+ ++m_TooShortCount;\r
+ if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)\r
+ m_ShortestLength = m_SeqLength;\r
+ continue;\r
+ }\r
+ if (m_SeqLength > opt_maxlen && opt_maxlen != 0)\r
+ {\r
+ if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)\r
+ m_LongestLength = m_SeqLength;\r
+ ++m_TooLongCount;\r
+ continue;\r
+ }\r
+ return Seq;\r
+ }\r
+ }\r
+\r
+const byte *SFasta::GetNextSeqLo()\r
+ {\r
+// End of cache?\r
+ if (m_BufferOffset == m_BufferBytes)\r
+ {\r
+ // End of file?\r
+ if (m_FilePos == m_FileSize)\r
+ return 0;\r
+ FillCache();\r
+ }\r
+\r
+ StartTimer(SF_GetNextSeq);\r
+ asserta(m_Buffer[m_BufferOffset] == '>');\r
+ m_Label = (char *) (m_Buffer + m_BufferOffset + 1);\r
+ \r
+//// Scan to end-of-line.\r
+//// Use dubious library function strchr() in the hope\r
+//// that it uses fast machine code.\r
+// byte *ptr = (byte *) strchr(m_Label, '\n');\r
+// asserta(ptr != 0);\r
+// *ptr = 0;\r
+\r
+ byte *ptr = 0;\r
+ for (unsigned i = m_BufferOffset; i < m_BufferSize; ++i)\r
+ {\r
+ char c = m_Buffer[i];\r
+ if (c == '\n' || c == '\r')\r
+ {\r
+ ptr = m_Buffer + i;\r
+ break;\r
+ }\r
+ }\r
+ asserta(ptr != 0);\r
+\r
+ if (opt_trunclabels)\r
+ {\r
+ for (char *p = m_Label; *p; ++p)\r
+ if (isspace(*p))\r
+ {\r
+ *p = 0;\r
+ break;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ for (char *p = m_Label; *p; ++p)\r
+ {\r
+ if (*p == '\t')\r
+ *p = ' ';\r
+ else if (*p == '\r' || *p == '\n')\r
+ {\r
+ *p = 0;\r
+ char NextChar = *(p+1);\r
+ if (NextChar == '\r' || NextChar == '\n')\r
+ ++p;\r
+ break;\r
+ }\r
+ }\r
+ }\r
+\r
+// ptr points to end-of-line.\r
+// Move to start of sequence data.\r
+ byte *Seq = ++ptr;\r
+\r
+// Delete white space in-place\r
+ byte *To = ptr;\r
+ m_BufferOffset = (unsigned) (ptr - m_Buffer);\r
+ while (m_BufferOffset < m_BufferBytes)\r
+ {\r
+ byte c = m_Buffer[m_BufferOffset];\r
+ if (c == '>')\r
+ {\r
+ char prevc = '\n';\r
+ if (m_BufferOffset > 0)\r
+ prevc = m_Buffer[m_BufferOffset-1];\r
+ if (prevc == '\n' || prevc == '\r')\r
+ break;\r
+ }\r
+ ++m_BufferOffset;\r
+ if (isalpha(c) || (isgap(c) && m_AllowGaps))\r
+ *To++ = c;\r
+ else if (c == '\n' || c == '\r')\r
+ continue;\r
+ else\r
+ {\r
+ const char *Label = (m_Label == 0 ? "" : m_Label);\r
+ static bool WarningDone = false;\r
+ if (!WarningDone)\r
+ {\r
+ if (isgap(c))\r
+ Warning("Ignoring gaps in FASTA file '%s'",\r
+ m_FileName.c_str());\r
+ else if (isprint(c))\r
+ Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",\r
+ m_FileName.c_str(), c, Label);\r
+ else\r
+ Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",\r
+ m_FileName.c_str(), c, Label);\r
+ WarningDone = true;\r
+ }\r
+ continue;\r
+ }\r
+ }\r
+ m_SeqLength = unsigned(To - Seq);\r
+\r
+ if (m_SeqIndex == UINT_MAX)\r
+ m_SeqIndex = 0;\r
+ else\r
+ ++m_SeqIndex;\r
+\r
+ EndTimer(SF_GetNextSeq);\r
+ return Seq;\r
+ }\r
+\r
+void SFasta::Open(const string &FileName)\r
+ {\r
+ Clear();\r
+ m_FileName = FileName;\r
+ m_File = OpenStdioFile(FileName);\r
+ m_BufferSize = BufferSize;\r
+ //m_Buffer = myalloc<byte>(m_BufferSize);\r
+ m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);\r
+ m_FileSize = GetStdioFileSize(m_File);\r
+ }\r
+\r
+void SFasta::Rewind()\r
+ {\r
+ m_BufferOffset = 0;\r
+ m_BufferBytes = 0;\r
+ m_FilePos = 0;\r
+ }\r
+\r
+bool SFasta::SetIsNucleo()\r
+ {\r
+ if (m_FilePos != 0)\r
+ Die("SFasta::IsNucleo, not at BOF");\r
+\r
+ unsigned LetterCount = 0;\r
+ unsigned NucleoLetterCount = 0;\r
+ for (;;)\r
+ {\r
+ const byte *Seq = GetNextSeq();\r
+ if (Seq == 0)\r
+ break;\r
+ unsigned L = GetSeqLength();\r
+ for (unsigned i = 0; i < L; ++i)\r
+ if (g_IsNucleoChar[Seq[i]])\r
+ ++NucleoLetterCount;\r
+ LetterCount += L;\r
+ if (LetterCount > 256)\r
+ break;\r
+ }\r
+ Rewind();\r
+ if (LetterCount == 0)\r
+ {\r
+ m_IsNucleoSet = true;\r
+ m_IsNucleo = true;\r
+ return true;\r
+ }\r
+\r
+// Nucleo if more than 90% nucleo letters AGCTUN\r
+ m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9;\r
+ m_IsNucleoSet = true;\r
+ return m_IsNucleo;\r
+ }\r
+\r
+void SFasta::FillCache()\r
+ {\r
+ StartTimer(SF_FillCache);\r
+ asserta(m_FilePos < m_FileSize);\r
+\r
+// off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.\r
+ off_t otBytesToRead = m_FileSize - m_FilePos;\r
+\r
+ bool FinalBuffer = true;\r
+ if (otBytesToRead > (off_t) m_BufferSize)\r
+ {\r
+ FinalBuffer = false;\r
+ otBytesToRead = m_BufferSize;\r
+ }\r
+\r
+ unsigned BytesToRead = unsigned(otBytesToRead);\r
+ asserta(BytesToRead > 0);\r
+ asserta(BytesToRead <= m_BufferSize);\r
+\r
+ SetStdioFilePos(m_File, m_FilePos);\r
+ ReadStdioFile(m_File, m_Buffer, BytesToRead);\r
+ if (m_Buffer[0] != '>')\r
+ {\r
+ if (m_FilePos == 0)\r
+ Die("Input is not FASTA file");\r
+ else\r
+ Die("SFasta::FillCache() failed, expected '>'");\r
+ }\r
+\r
+ m_BufferOffset = 0;\r
+\r
+// If last buffer in file, done\r
+ if (FinalBuffer)\r
+ {\r
+ m_BufferBytes = BytesToRead;\r
+ m_FilePos += BytesToRead;\r
+ EndTimer(SF_FillCache);\r
+ return;\r
+ }\r
+\r
+// If not last buffer, truncate any partial sequence\r
+// at end of buffer. Search backwards to find last '>'.\r
+ byte *ptr = m_Buffer + BytesToRead - 1;\r
+ while (ptr > m_Buffer)\r
+ {\r
+ if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))\r
+ break;\r
+ --ptr;\r
+ }\r
+\r
+ if (ptr == m_Buffer)\r
+ {\r
+ LogMe();\r
+ if (*ptr != '>')\r
+ {\r
+ // No '>' found.\r
+ // This might techincally be legal FASTA if the entire\r
+ // buffer is white space, but strange if not the last buffer\r
+ // in the file, so quit anyway.\r
+ Die("Failed to find '>' (pos=%u, bytes=%u)",\r
+ (unsigned) m_FilePos, BytesToRead);\r
+ }\r
+ else\r
+ {\r
+ // Entire buffer is one sequence which may be truncated.\r
+ Die("Sequence too long (pos=%u, bytes=%u)",\r
+ (unsigned) m_FilePos, BytesToRead);\r
+ }\r
+ }\r
+\r
+ asserta(*ptr == '>');\r
+\r
+ m_BufferBytes = unsigned(ptr - m_Buffer);\r
+ m_FilePos += m_BufferBytes;\r
+\r
+ EndTimer(SF_FillCache);\r
+ }\r
+\r
+unsigned SFasta::GetPctDoneX10() const\r
+ {\r
+ if (m_FilePos == 0 || m_FileSize == 0)\r
+ return 0;\r
+\r
+ assert(m_FilePos >= (off_t) m_BufferBytes);\r
+ off_t BufferStart = m_FilePos - m_BufferBytes;\r
+ off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+ unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize));\r
+ if (iPctX10 == 0)\r
+ return 1;\r
+ if (iPctX10 >= 999)\r
+ return 998;\r
+ return iPctX10;\r
+ }\r
+\r
+double SFasta::GetPctDone() const\r
+ {\r
+ if (m_FilePos == 0 || m_FileSize == 0)\r
+ return 0;\r
+\r
+ assert(m_FilePos >= (off_t) m_BufferBytes);\r
+ off_t BufferStart = m_FilePos - m_BufferBytes;\r
+ off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+ return double(BufferPos)*100.0/double(m_FileSize);\r
+ }\r
+\r
+bool SFasta::GetNextSD(SeqData &SD)\r
+ {\r
+ SD.Seq = GetNextSeq();\r
+ if (SD.Seq == 0)\r
+ return false;\r
+\r
+ SD.Label = GetLabel();\r
+ SD.L = GetSeqLength();\r
+ SD.Index = GetSeqIndex();\r
+ SD.ORFParent = 0;\r
+ SD.Nucleo = GetIsNucleo();\r
+ SD.RevComp = false;\r
+\r
+ return true;\r
+ }\r
+\r
+#if TEST\r
+void TestSFasta()\r
+ {\r
+ SFasta SF;\r
+ SF.Open(opt_input);\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log(" Index Length Label\n");\r
+ Log("------- ------- -----\n");\r
+ }\r
+\r
+ unsigned Index = 0;\r
+ unsigned SeqCount = 0;\r
+ double LetterCount = 0.0;\r
+ ProgressStep(0, 1000, "Reading");\r
+ for (;;)\r
+ {\r
+ const byte *Seq = SF.GetNextSeq();\r
+ if (Seq == 0)\r
+ break;\r
+ ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");\r
+ const char *Label = SF.GetLabel();\r
+ unsigned L = SF.GetSeqLength();\r
+ ++SeqCount;\r
+ LetterCount += L;\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log(">%7u %7u '%s'\n", Index, L, Label);\r
+ Log("+%7.7s %7.7s \"%*.*s\"\n", "", "", L, L, Seq);\r
+ }\r
+\r
+ ++Index;\r
+ }\r
+ ProgressStep(999, 1000, "Reading");\r
+\r
+ Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+ Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+ }\r
+#endif // TEST\r