From eece01e3958ec2557ebb7d5ed62d7545b6ee37ec Mon Sep 17 00:00:00 2001 From: Reinhold Kainhofer Date: Tue, 12 Apr 2011 13:47:04 +0200 Subject: [PATCH] pdf-metadata: Use UTF-16BE for metadata if required All Latin1 metadata strings need to be printed out to the .ps file in Latin1 (NOT in UTF-8), and all non-Latin1 strings need to use UTF-16BE encoding. The escaping of parentheses and backslashes needs to be applied AFTER encoding the string (since the escaping is required to correctly detect the end of the encoding string when reading the byte-sequence of the encoded string from the file. Basically, we are dumping a binary sequence to the file, enclosed by parentheses. All \051 bytes need to be escaped to make sure they are not detected as the closing parenthesis.) --- input/regression/pdfmark-metadata-unicode.ly | 26 +++++++++ lily/pdf-scheme.cc | 60 ++++++++++++++++++++ scm/framework-ps.scm | 6 +- 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 input/regression/pdfmark-metadata-unicode.ly create mode 100644 lily/pdf-scheme.cc diff --git a/input/regression/pdfmark-metadata-unicode.ly b/input/regression/pdfmark-metadata-unicode.ly new file mode 100644 index 0000000000..5f51a620c0 --- /dev/null +++ b/input/regression/pdfmark-metadata-unicode.ly @@ -0,0 +1,26 @@ +\version "2.13.60" + + +\header +{ + + texidoc = "PDF metadata need either Latin1 encoding (not UTF8) or full + UTF-16BE with BOM. The title field uses full UTF-16 (Russian characters, + euro, etc), while the composer uses normal European diacritics (which need + to be encoded as Latin1, not as UTF8). Closing parentheses need to be + escaped by a backslash AFTER encoding!" + + % Non-latin1 text, requiring UTF-16BE (with BOM) encoding in PDF metadata: + % closing parentheses and backslashes need to be escaped AFTER encoding! 
+ title = "UTF-16BE title:² € ĂĄœŖŮůſЖюљ)\\\n ¡" + % Latin1 text, requiring at least PDFDocEncoding in PDF metadata, all Latin1 + % characters coincide, so no special encoding is required, just print out + % the Latin1 characters (NOT the utf8 bytes!) + composer = "Latin1 composer (with special chars): Jöhånñ Strauß" + poet = "UTF-16BE with parentheses: ) € ĂĄœŖŮůſЖюљ" +} + +\score +{ + \new Staff c'1 +} \ No newline at end of file diff --git a/lily/pdf-scheme.cc b/lily/pdf-scheme.cc new file mode 100644 index 0000000000..6d717c55ad --- /dev/null +++ b/lily/pdf-scheme.cc @@ -0,0 +1,60 @@ +/* + This file is part of LilyPond, the GNU music typesetter. + + Copyright (C) 2011 Reinhold Kainhofer + + LilyPond is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + LilyPond is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with LilyPond. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+#include <cstring>
+#include <glib.h>
+using namespace std;
+
+#include "lily-guile.hh"
+
+LY_DEFINE (ly_encode_string_for_pdf, "ly:encode-string-for-pdf",
+           1, 0, 0, (SCM str),
+           "Check whether the string needs to be encoded for PDF output (Latin1,"
+           " PDFDocEncoding or in the most general case UTF-16BE).")
+{
+  /* Return STR as ISO-8859-1 bytes if representable, else as UTF-16BE bytes
+     with a leading BOM, else (both conversions failed) STR unchanged.
+     NOTE(review): assumes ly_scm2str0 yields locale-encoded text -- confirm.  */
+  LY_ASSERT_TYPE (scm_is_string, str, 1);
+  char *p = ly_scm2str0 (str);
+  const char *charset;
+  gsize bytes_written = 0;
+  g_get_charset (&charset); /* The current locale */
+  SCM result = str;
+
+  /* First, try to convert to ISO-8859-1 (no encodings required) */
+  char *g = g_convert (p, -1, "ISO-8859-1", charset, 0, &bytes_written, 0);
+  if (g)
+    result = scm_from_locale_stringn (g, bytes_written);
+  else if ((g = g_convert (p, -1, "UTF-16BE", charset, 0, &bytes_written, 0)))
+    {
+      /* Latin1 failed, so resort to full UTF-16BE.  g_convert doesn't emit
+         the BOM, so prepend it manually.  */
+      char *with_bom = static_cast<char *> (g_malloc (bytes_written + 2));
+      with_bom[0] = static_cast<char> (0xFE);
+      with_bom[1] = static_cast<char> (0xFF);
+      memcpy (with_bom + 2, g, bytes_written);
+      result = scm_from_locale_stringn (with_bom, bytes_written + 2);
+      g_free (with_bom);
+    }
+  /* The SCM string owns a copy; g_convert output needs g_free, not free.  */
+  g_free (g);
+  free (p);
+  return result;
+}
diff --git a/scm/framework-ps.scm b/scm/framework-ps.scm index eb4e545531..b0359bcd88 100644 --- a/scm/framework-ps.scm +++ b/scm/framework-ps.scm @@ -413,12 +413,16 @@ ;;; Create DOCINFO pdfmark containing metadata ;;; header fields with pdf prefix override those without the prefix (define (handle-metadata header port) + (define (metadata-encode val) + ;; First, call ly:encode-string-for-pdf to encode the string (latin1 or + ;; utf-16be), then escape all parentheses and backslashes + (ps-quote (ly:encode-string-for-pdf val))) (define (metadata-lookup-output overridevar fallbackvar field) (let* ((overrideval (ly:modules-lookup (list header) overridevar)) (fallbackval (ly:modules-lookup (list header) fallbackvar)) (val (if overrideval overrideval fallbackval))) (if val - (format port "/~a 
(~a)\n" field (ps-quote (markup->string val)))))) + (format port "/~a (~a)\n" field (metadata-encode (markup->string val)))))) (display "[ " port) (metadata-lookup-output 'pdfcomposer 'composer "Author") (format port "/Creator (LilyPond ~a)\n" (lilypond-version)) -- 2.39.2