From 3bf397f8e8aa6560b575ffae1b539d33539d8902 Mon Sep 17 00:00:00 2001 From: David Kastrup Date: Mon, 2 Jan 2012 16:43:09 +0100 Subject: [PATCH] misc.cc: remove utf8_char_len, change callers. This was too fragile when fed invalid UTF-8. Callers still are far from pretty. --- lily/include/misc.hh | 1 - lily/misc.cc | 20 -------------------- lily/source-file.cc | 27 ++++++++++++--------------- lily/text-interface.cc | 24 +++++++++++++----------- 4 files changed, 25 insertions(+), 47 deletions(-) diff --git a/lily/include/misc.hh b/lily/include/misc.hh index ce4f4c838b..18f996ad4b 100644 --- a/lily/include/misc.hh +++ b/lily/include/misc.hh @@ -64,7 +64,6 @@ Real directed_round (Real f, Direction d); Real peak_around (Real epsilon, Real threshold, Real x); Real convex_amplifier (Real standard_x, Real increase_factor, Real x); string camel_case_to_lisp_identifier (string in); -vsize utf8_char_len (char); #endif diff --git a/lily/misc.cc b/lily/misc.cc index 0fa4a90240..cc673e9548 100644 --- a/lily/misc.cc +++ b/lily/misc.cc @@ -94,23 +94,3 @@ camel_case_to_lisp_identifier (string in) return result; } - -vsize -utf8_char_len (char current) -{ - vsize char_len = 1; - - // U+10000 - U+10FFFF - if ((current & 0xF0) == 0xF0) - char_len = 4; - // U+0800 - U+FFFF - else if ((current & 0xE0) == 0xE0) - char_len = 3; - // U+0080 - U+07FF - else if ((current & 0xC0) == 0xC0) - char_len = 2; - else if (current & 0x80) - programming_error ("invalid UTF-8 string"); - - return char_len; -} diff --git a/lily/source-file.cc b/lily/source-file.cc index e6f7a4fb19..69611709e1 100644 --- a/lily/source-file.cc +++ b/lily/source-file.cc @@ -277,30 +277,27 @@ Source_file::get_counts (char const *pos_str0, char const *line_start = (char const *)data + line[LEFT]; ssize left = (char const *) pos_str0 - line_start; + *byte_offset = left; + string line_begin (line_start, left); char const *line_chars = line_begin.c_str (); - while (left > 0) + for (; left > 0; --left, ++line_chars) { - size_t thislen = utf8_char_len (*line_chars); - - if (thislen == 1 && line_chars[0] == '\t') + // Skip UTF-8 continuation bytes. This is simplistic but + // robust, and we warn against non-UTF-8 input in the lexer + // already. In the case of non-UTF-8 or of this function being + // called in mid-character, the results are somewhat arbitrary, + // but there is no really sane definition anyway. + if ((*line_chars & 0xc0) == 0x80) + continue; + + if (*line_chars == '\t') (*column) = (*column / 8 + 1) * 8; else (*column)++; (*line_char)++; - - /* - To have decent output in UTF-8 aware terminals, - we must keep track of the number of bytes from - the left edge of the terminal. - */ - *byte_offset += thislen; - - /* Advance past this character. */ - line_chars += thislen; - left -= thislen; } } diff --git a/lily/text-interface.cc b/lily/text-interface.cc index 47171b0949..2bb4545a20 100644 --- a/lily/text-interface.cc +++ b/lily/text-interface.cc @@ -33,9 +33,8 @@ #include "warn.hh" static void -replace_special_characters (string *str, SCM props) +replace_special_characters (string &str, SCM props) { - vsize i = 0; SCM replacement_alist = ly_chain_assoc_get (ly_symbol2scm ("replacement-alist"), props, SCM_EOL); @@ -47,18 +46,21 @@ replace_special_characters (string *str, SCM props) (scm_string_length (scm_caar (s)))); } - while (i <= str->size ()) + for (vsize i = 0; i < str.size (); i++) { + /* Don't match in mid-UTF-8 */ + if ((str[i] & 0xc0) == 0x80) + continue; for (vsize j = max_length + 1; j--;) { - string dummy = str->substr (i, j); - string ligature = robust_scm2string - (ly_assoc_get (ly_string2scm (dummy), - replacement_alist, SCM_BOOL_F), ""); - if (ligature != "") - str->replace (i, j, ligature); + if (j > str.size () - i) + continue; + string dummy = str.substr (i, j); + SCM ligature = ly_assoc_get (ly_string2scm (dummy), + replacement_alist, SCM_BOOL_F); + if (scm_is_true (ligature)) + str.replace (i, j, robust_scm2string (ligature, "")); } - i += utf8_char_len ((*str)[i]); } } @@ -75,7 +77,7 @@ Text_interface::interpret_string (SCM layout_smob, Output_def *layout = unsmob_output_def (layout_smob); Font_metric *fm = select_encoded_font (layout, props); - replace_special_characters (&str, props); + replace_special_characters (str, props); /* We want to filter strings with a music font that pass through -- 2.39.2