From 3bf397f8e8aa6560b575ffae1b539d33539d8902 Mon Sep 17 00:00:00 2001
From: David Kastrup <dak@gnu.org>
Date: Mon, 2 Jan 2012 16:43:09 +0100
Subject: [PATCH] misc.cc: remove utf8_char_len, change callers.

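utf8_char_len derived a character's length from its lead byte alone,
which was too fragile when fed invalid UTF-8.  The callers now skip
UTF-8 continuation bytes instead; they are still far from pretty.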
---
 lily/include/misc.hh   |  1 -
 lily/misc.cc           | 20 --------------------
 lily/source-file.cc    | 27 ++++++++++++---------------
 lily/text-interface.cc | 24 +++++++++++++-----------
 4 files changed, 25 insertions(+), 47 deletions(-)

diff --git a/lily/include/misc.hh b/lily/include/misc.hh
index ce4f4c838b..18f996ad4b 100644
--- a/lily/include/misc.hh
+++ b/lily/include/misc.hh
@@ -64,7 +64,6 @@ Real directed_round (Real f, Direction d);
 Real peak_around (Real epsilon, Real threshold, Real x);
 Real convex_amplifier (Real standard_x, Real increase_factor, Real x);
 string camel_case_to_lisp_identifier (string in);
-vsize utf8_char_len (char);
 
 #endif
 
diff --git a/lily/misc.cc b/lily/misc.cc
index 0fa4a90240..cc673e9548 100644
--- a/lily/misc.cc
+++ b/lily/misc.cc
@@ -94,23 +94,3 @@ camel_case_to_lisp_identifier (string in)
 
   return result;
 }
-
-vsize
-utf8_char_len (char current)
-{
-  vsize char_len = 1;
-
-  // U+10000 - U+10FFFF
-  if ((current & 0xF0) == 0xF0)
-    char_len = 4;
-  // U+0800 - U+FFFF
-  else if ((current & 0xE0) == 0xE0)
-    char_len = 3;
-  // U+0080 - U+07FF
-  else if ((current & 0xC0) == 0xC0)
-    char_len = 2;
-  else if (current & 0x80)
-    programming_error ("invalid UTF-8 string");
-
-  return char_len;
-}
diff --git a/lily/source-file.cc b/lily/source-file.cc
index e6f7a4fb19..69611709e1 100644
--- a/lily/source-file.cc
+++ b/lily/source-file.cc
@@ -277,30 +277,27 @@ Source_file::get_counts (char const *pos_str0,
   char const *line_start = (char const *)data + line[LEFT];
 
   ssize left = (char const *) pos_str0 - line_start;
+  *byte_offset = left;
+
   string line_begin (line_start, left);
   char const *line_chars = line_begin.c_str ();
 
-  while (left > 0)
+  for (; left > 0; --left, ++line_chars)
     {
-      size_t thislen = utf8_char_len (*line_chars);
-
-      if (thislen == 1 && line_chars[0] == '\t')
+      // Skip UTF-8 continuation bytes.  This is simplistic but
+      // robust, and we warn against non-UTF-8 input in the lexer
+      // already.  In the case of non-UTF-8 or of this function being
+      // called in mid-character, the results are somewhat arbitrary,
+      // but there is no really sane definition anyway.
+      if ((*line_chars & 0xc0) == 0x80)
+	continue;
+
+      if (*line_chars == '\t')
         (*column) = (*column / 8 + 1) * 8;
       else
         (*column)++;
 
       (*line_char)++;
-
-      /*
-        To have decent output in UTF-8 aware terminals,
-        we must keep track of the number of bytes from
-        the left edge of the terminal.
-      */
-      *byte_offset += thislen;
-
-      /* Advance past this character. */
-      line_chars += thislen;
-      left -= thislen;
     }
 }
 
diff --git a/lily/text-interface.cc b/lily/text-interface.cc
index 47171b0949..2bb4545a20 100644
--- a/lily/text-interface.cc
+++ b/lily/text-interface.cc
@@ -33,9 +33,8 @@
 #include "warn.hh"
 
 static void
-replace_special_characters (string *str, SCM props)
+replace_special_characters (string &str, SCM props)
 {
-  vsize i = 0;
   SCM replacement_alist = ly_chain_assoc_get (ly_symbol2scm ("replacement-alist"),
                                               props,
                                               SCM_EOL);
@@ -47,18 +46,21 @@ replace_special_characters (string *str, SCM props)
                         (scm_string_length (scm_caar (s))));
     }
 
-  while (i <= str->size ())
+  for (vsize i = 0; i < str.size (); i++)
     {
+      /* Don't match in mid-UTF-8 */
+      if ((str[i] & 0xc0) == 0x80)
+	continue;
       for (vsize j = max_length + 1; j--;)
         {
-          string dummy = str->substr (i, j);
-          string ligature = robust_scm2string
-                            (ly_assoc_get (ly_string2scm (dummy),
-                                           replacement_alist, SCM_BOOL_F), "");
-          if (ligature != "")
-            str->replace (i, j, ligature);
+	  if (j > str.size () - i)
+	    continue;
+          string dummy = str.substr (i, j);
+          SCM ligature = ly_assoc_get (ly_string2scm (dummy),
+				       replacement_alist, SCM_BOOL_F);
+	  if (scm_is_true (ligature))
+            str.replace (i, j, robust_scm2string (ligature, ""));
         }
-      i += utf8_char_len ((*str)[i]);
     }
 }
 
@@ -75,7 +77,7 @@ Text_interface::interpret_string (SCM layout_smob,
   Output_def *layout = unsmob_output_def (layout_smob);
   Font_metric *fm = select_encoded_font (layout, props);
 
-  replace_special_characters (&str, props);
+  replace_special_characters (str, props);
 
   /*
     We want to filter strings with a music font that pass through
-- 
2.39.5