texi2html: Script to generate the nodename<=>filename/anchor map for texi2html

author Reinhold Kainhofer <reinhold@kainhofer.com>

Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)

committer Reinhold Kainhofer <reinhold@kainhofer.com>

Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)
author Reinhold Kainhofer <reinhold@kainhofer.com>
Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)
committer Reinhold Kainhofer <reinhold@kainhofer.com>
Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)
diff --git a/buildscripts/extract_texi_filenames.py b/buildscripts/extract_texi_filenames.py

new file mode 100755 (executable)

index 0000000..de27ffa
--- /dev/null
+++ b/buildscripts/extract_texi_filenames.py
@@ -0,0 +1,129 @@
+#!@PYTHON@
+# -*- coding: utf-8 -*-
+# extrace_texi_filenames.py
+
+# USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
+#
+# -o OUTDIR specifies that output files should rather be written in OUTDIR
+#
+# This script parses the .texi file given and creates a file with the
+# nodename <=> filename/anchor map (tab-separated as NODE\tFILENAME\tANCHOR).
+# The idea behind: Unnumbered subsections go into the same file as the
+# previous numbered section, @translationof gives the original node name,
+# which is then used for the filename/anchor.
+#
+# If this script is run on a file texifile.texi, it produces a file
+# texifile_xref.map, which can then be used by our texi2html init script
+# to determine the correct file name and anchor for external refs
+
+import sys
+import re
+import os
+import getopt
+
+#import langdefs
+
+optlist, args = getopt.getopt (sys.argv[1:],'o:')
+files = args
+
+outdir = '.'
+for x in optlist:
+    if x[0] == '-o':
+        outdir = x[1]
+
+include_re = re.compile (r'@include ((?!../lily-).*?)\.texi$', re.M)
+whitespaces = re.compile (r'\s+')
+section_translation_re = re.compile (r'@(node|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\n')
+
+def expand_includes (m):
+    filepath = os.path.join (os.path.dirname (m.group(0)), m.group(1)) + '.texi'
+    print "Including file: " + filepath
+    if os.path.exists (filepath):
+        return extract_sections (filepath)
+    return m.group(0)
+
+def extract_sections (filename):
+    result = ''
+    f = open (filename, 'r')
+    page = f.read ()
+    f.close()
+    # Replace all includes by their list of sections and extract all sections
+    page = include_re.sub (expand_includes, page)
+    sections = section_translation_re.findall (page)
+    for sec in sections:
+        result += "@" + sec[0] + " " + sec[1] + "\n"
+    return result
+
+def texinfo_file_name(title):
+    # File name normalization by texinfo (described in the texinfo manual):
+    # 1/2: letters and numbers are left unchanged
+    # 3/4: multiple, leading and trailing whitespace is removed
+    title = title.strip ();
+    title = whitespaces.sub (' ', title)
+    # 5:   all remaining spaces are converted to '-'
+    # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
+    result = ''
+    for index in range(len(title)):
+        char = title[index]
+        if char == ' ': # space -> '-'
+            result += '-'
+        elif ( ('0' <= char and char <= '9' ) or
+               ('A' <= char and char <= 'Z' ) or
+               ('a' <= char and char <= 'z' ) ):  # number or letter
+            result += char
+        else:
+            ccode = ord(char)
+            if ccode <= 0xFFFF:
+                result += "_%04x" % ccode
+            else:
+                result += "__%06x" % ccode
+    # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
+    if ord(result[0]) in range (ord('0'), ord('9')):
+        result = 't_g' + result
+    return result
+
+texinfo_re = re.compile (r'@.*{(.*)}')
+def remove_texinfo (title):
+    return texinfo_re.sub (r'\1', title)
+
+def create_texinfo_anchor (title):
+    return texinfo_file_name (remove_texinfo (title))
+
+unnumbered_re = re.compile (r'unnumbered.*')
+def process_sections (filename, page):
+    sections = section_translation_re.findall (page)
+    p = os.path.join (outdir, filename) [:-5] + '_xref.map'
+    f = open (p, 'w')
+
+    this_title = ''
+    this_filename = ''
+    this_anchor = ''
+    this_unnumbered = False
+    for sec in sections:
+        if sec[0] == "node":
+            # Write out the cached values to the file and start a new section:
+            if this_title != '':
+                f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
+                print (this_title + "\t" + this_filename + "\t" + this_anchor)
+            this_title = remove_texinfo (sec[1])
+            this_anchor = create_texinfo_anchor (sec[1])
+        if sec[0] == "translationof":
+            anchor = create_texinfo_anchor (sec[1])
+            this_anchor = anchor
+            if not this_unnumbered:
+                this_filename = anchor
+        else:
+            this_unnumbered = unnumbered_re.match (sec[0])
+            if not this_unnumbered:
+                this_filename = this_anchor
+
+    if this_title != '':
+        f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
+        print (this_title + "\t" + this_filename + "\t" + this_anchor)
+    f.close ()
+
+
+for filename in files:
+    print "extract_texi_filenames.py: Processing %s" % filename
+    sections = extract_sections (filename)
+    process_sections (filename, sections)
author	Reinhold Kainhofer <reinhold@kainhofer.com>
	Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)
committer	Reinhold Kainhofer <reinhold@kainhofer.com>
	Sat, 19 Jul 2008 18:26:11 +0000 (20:26 +0200)