From eece01e3958ec2557ebb7d5ed62d7545b6ee37ec Mon Sep 17 00:00:00 2001
From: Reinhold Kainhofer <reinhold@kainhofer.com>
Date: Tue, 12 Apr 2011 13:47:04 +0200
Subject: [PATCH] pdf-metadata: Use UTF-16BE for metadata if required

All Latin1 metadata strings need to be printed out to the .ps file
in Latin1 (NOT in UTF-8), and all non-Latin1 strings need to use
UTF-16BE encoding.
The escaping of parentheses and backslashes needs to be applied
AFTER encoding the string, since the escaping is required to correctly
detect the end of the encoded string when reading its byte sequence
from the file. Basically, we are dumping a binary sequence to the
file, enclosed by parentheses. All \051 bytes (closing parentheses)
need to be escaped to make sure they are not detected as the closing
parenthesis of the string.
---
 input/regression/pdfmark-metadata-unicode.ly | 26 +++++++++
 lily/pdf-scheme.cc                           | 60 ++++++++++++++++++++
 scm/framework-ps.scm                         |  6 +-
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 input/regression/pdfmark-metadata-unicode.ly
 create mode 100644 lily/pdf-scheme.cc

diff --git a/input/regression/pdfmark-metadata-unicode.ly b/input/regression/pdfmark-metadata-unicode.ly
new file mode 100644
index 0000000000..5f51a620c0
--- /dev/null
+++ b/input/regression/pdfmark-metadata-unicode.ly
@@ -0,0 +1,26 @@
+\version "2.13.60"
+
+
+\header
+{
+
+  texidoc = "PDF metadata need either Latin1 encoding (not UTF8) or full
+  UTF-16BE with BOM. The title field uses full UTF-16 (russian characters,
+  euro, etc), while the composer uses normal european diacrits (which need
+  to be encoded as Latin1, not as UTF8). Closing parenthesis need to be
+  escaped by a backslash AFTER encoding!"
+
+  % Non-latin1 text, requiring UTF-16BE (with BOM) encoding in PDF metadata:
+  % closing parentheses and backslashes need to be escaped AFTER encoding!
+  title = "UTF-16BE title:Â² â¬ ÄÄÅÅÅ®Å¯Å¿ÐÑÑ)\\\n Â¡"
+  % Latin1 text, requiring at least PDFDocEncoding in PDF metadata, all Latin1
+  % characters coincide, so no special encoding is required, just print out
+  % the Latin1 characters (NOT the utf8 bytes!)
+  composer = "Latin1 composer (with special chars): JÃ¶hÃ¥nÃ± StrauÃ"
+  poet = "UTF-16BE with parentheses: ) â¬ ÄÄÅÅÅ®Å¯Å¿ÐÑÑ"
+}
+
+\score
+{
+  \new Staff c'1
+}
\ No newline at end of file
diff --git a/lily/pdf-scheme.cc b/lily/pdf-scheme.cc
new file mode 100644
index 0000000000..6d717c55ad
--- /dev/null
+++ b/lily/pdf-scheme.cc
@@ -0,0 +1,60 @@
+/*
+  This file is part of LilyPond, the GNU music typesetter.
+
+  Copyright (C) 2011 Reinhold Kainhofer <reinhold@kainhofer.com>
+
+  LilyPond is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  LilyPond is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with LilyPond.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <glib.h>
+using namespace std;
+
+#include "lily-guile.hh"
+
+
+LY_DEFINE (ly_encode_string_for_pdf, "ly:encode-string-for-pdf",
+	   1, 0, 0, (SCM str),
+	   "Check whether the string needs to be encoded for PDF output (Latin1,"
+	   " PDFDocEncoding or in the most general case UTF-16BE).")
+{
+  /* Return STR converted to ISO-8859-1 if possible, else to UTF-16BE with a
+     leading BOM (0xFE 0xFF); if both conversions fail, return STR as is. */
+  LY_ASSERT_TYPE (scm_is_string, str, 1);
+  char *p = ly_scm2str0 (str);
+  const char *charset;
+  gsize bytes_written = 0;
+  g_get_charset (&charset); /* locale charset; NOTE(review): confirm p uses it */
+  /* First, try to convert to ISO-8859-1 (no encodings required) */
+  char *g = g_convert (p, -1, "ISO-8859-1", charset, 0, &bytes_written, 0);
+  /* If that fails, we have to resort to full UTF-16BE */
+  if (!g) {
+    char *raw = g_convert (p, -1, "UTF-16BE", charset, 0, &bytes_written, 0);
+    if (raw) { /* may fail as well; g then stays NULL and STR is returned */
+      /* prepend the BOM manually, g_convert doesn't do it! */
+      g = static_cast<char *> (g_malloc (bytes_written + 3));
+      g[0] = (char)254;
+      g[1] = (char)255;
+      memcpy (&g[2], raw, bytes_written + 1); // Copy string + \0
+      g_free (raw); /* g_convert results are released with g_free */
+      bytes_written += 2;
+    }
+  }
+  free (p);
+  if (!g)
+    return str;
+  /* scm_from_locale_stringn copies the bytes, so the buffer can be freed */
+  SCM result = scm_from_locale_stringn (g, bytes_written);
+  g_free (g);
+  return result;
+}
diff --git a/scm/framework-ps.scm b/scm/framework-ps.scm
index eb4e545531..b0359bcd88 100644
--- a/scm/framework-ps.scm
+++ b/scm/framework-ps.scm
@@ -413,12 +413,16 @@
 ;;; Create DOCINFO pdfmark containing metadata
 ;;; header fields with pdf prefix override those without the prefix
 (define (handle-metadata header port)
+  (define (metadata-encode val)
+    ;; Encoding (latin1 or utf-16be, via ly:encode-string-for-pdf) must come
+    ;; first; parentheses and backslashes are escaped on the encoded result.
+    (let ((encoded (ly:encode-string-for-pdf val))) (ps-quote encoded)))
   (define (metadata-lookup-output overridevar fallbackvar field)
     (let* ((overrideval (ly:modules-lookup (list header) overridevar))
 	   (fallbackval (ly:modules-lookup (list header) fallbackvar))
 	   (val (if overrideval overrideval fallbackval)))
       (if val
-	  (format port "/~a (~a)\n" field (ps-quote (markup->string val))))))
+	  (format port "/~a (~a)\n" field (metadata-encode (markup->string val))))))
   (display "[ " port)
   (metadata-lookup-output 'pdfcomposer 'composer "Author")
   (format port "/Creator (LilyPond ~a)\n" (lilypond-version))
-- 
2.39.5