- /*
- FIXME, this is apparently locale dependent.
- */
-#if HAVE_MBRTOWC
- wchar_t multibyte[2];
- size_t thislen = mbrtowc (multibyte, line_chars, left, &state);
-#else
- size_t thislen = 1;
-#endif /* !HAVE_MBRTOWC */
-
- /* Stop converting at invalid character;
- this can mean we have read just the first part
- of a valid character. */
- if (thislen == (size_t) -1)
- break;
-
- /* We want to handle embedded NUL bytes
- but the return value is 0. Correct this. */
- if (thislen == 0)
- thislen = 1;
-
- if (thislen == 1 && line_chars[0] == '\t')
- (*column) = (*column / 8 + 1) * 8;
+ // Skip UTF-8 continuation bytes. This is simplistic but
+ // robust, and we warn against non-UTF-8 input in the lexer
+ // already. In the case of non-UTF-8 or of this function being
+ // called in mid-character, the results are somewhat arbitrary,
+ // but there is no really sane definition anyway.
+ if ((*line_chars & 0xc0) == 0x80)
+ continue;
+
+ if (*line_chars == '\t')
+ (*column) = (*column / 8 + 1) * 8;