buildscripts/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
# extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
  10 # This script parses the .texi file given and creates a file with the
  11 # nodename <=> filename/anchor map.
  12 # The idea behind: Unnumbered subsections go into the same file as the
  13 # previous numbered section, @translationof gives the original node name,
  14 # which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile_xref.map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # Note: The filename does not have any extension appended!
  20 # This file can then be used by our texi2html init script to determine
  21 # the correct file name and anchor for external refs
  22
  23 import sys
  24 import re
  25 import os
  26 import getopt
  27
  28 #import langdefs
  29
  30 optlist, args = getopt.getopt (sys.argv[1:],'o:')
  31 files = args
  32
  33 outdir = '.'
  34 for x in optlist:
  35     if x[0] == '-o':
  36         outdir = x[1]
  37
  38 include_re = re.compile (r'@include ((?!../lily-).*?)\.texi$', re.M)
  39 whitespaces = re.compile (r'\s+')
  40 section_translation_re = re.compile (r'@(node|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\n')
  41
  42 def expand_includes (m):
  43     filepath = os.path.join (os.path.dirname (m.group(0)), m.group(1)) + '.texi'
  44     print "Including file: " + filepath
  45     if os.path.exists (filepath):
  46         return extract_sections (filepath)
  47     return ''
  48
  49 def extract_sections (filename):
  50     result = ''
  51     f = open (filename, 'r')
  52     page = f.read ()
  53     f.close()
  54     # Replace all includes by their list of sections and extract all sections
  55     page = include_re.sub (expand_includes, page)
  56     sections = section_translation_re.findall (page)
  57     for sec in sections:
  58         result += "@" + sec[0] + " " + sec[1] + "\n"
  59     return result
  60
  61 # Convert a given node name to its proper file name (normalization as explained
  62 # in the texinfo manual:
  63 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
  64 def texinfo_file_name(title):
  65     # exception: The top node is always mapped to index.html
  66     if title == "Top":
  67         return "index"
  68     # File name normalization by texinfo (described in the texinfo manual):
  69     # 1/2: letters and numbers are left unchanged
  70     # 3/4: multiple, leading and trailing whitespace is removed
  71     title = title.strip ();
  72     title = whitespaces.sub (' ', title)
  73     # 5:   all remaining spaces are converted to '-'
  74     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
  75     result = ''
  76     for index in range(len(title)):
  77         char = title[index]
  78         if char == ' ': # space -> '-'
  79             result += '-'
  80         elif ( ('0' <= char and char <= '9' ) or
  81                ('A' <= char and char <= 'Z' ) or
  82                ('a' <= char and char <= 'z' ) ):  # number or letter
  83             result += char
  84         else:
  85             ccode = ord(char)
  86             if ccode <= 0xFFFF:
  87                 result += "_%04x" % ccode
  88             else:
  89                 result += "__%06x" % ccode
  90     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
  91     if ord(result[0]) in range (ord('0'), ord('9')):
  92         result = 't_g' + result
  93     return result
  94
  95 texinfo_re = re.compile (r'@.*{(.*)}')
  96 def remove_texinfo (title):
  97     return texinfo_re.sub (r'\1', title)
  98
  99 def create_texinfo_anchor (title):
 100     return texinfo_file_name (remove_texinfo (title))
 101
 102 unnumbered_re = re.compile (r'unnumbered.*')
 103 def process_sections (filename, page):
 104     sections = section_translation_re.findall (page)
 105     # TODO: Don't rely on the file having a 4-letter extension (texi)!!!
 106     p = os.path.join (outdir, filename) [:-5] + '_xref.map'
 107     f = open (p, 'w')
 108
 109     this_title = ''
 110     this_filename = ''
 111     this_anchor = ''
 112     this_unnumbered = False
 113     for sec in sections:
 114         if sec[0] == "node":
 115             # Write out the cached values to the file and start a new section:
 116             if this_title != '':
 117                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 118                 print (this_title + "\t" + this_filename + "\t" + this_anchor)
 119             this_title = remove_texinfo (sec[1])
 120             this_anchor = create_texinfo_anchor (sec[1])
 121         elif sec[0] == "translationof":
 122             anchor = create_texinfo_anchor (sec[1])
 123             # If @translationof is used, it gives the original node name, which
 124             # we use for the anchor and the file name (if it is a numbered node)
 125             this_anchor = anchor
 126             if not this_unnumbered:
 127                 this_filename = anchor
 128         else:
 129             # unnumbered nodes use the previously used file name, only numbered
 130             # nodes get their own filename!
 131             this_unnumbered = unnumbered_re.match (sec[0])
 132             if not this_unnumbered:
 133                 this_filename = this_anchor
 134
 135     if this_title != '':
 136         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 137         print (this_title + "\t" + this_filename + "\t" + this_anchor)
 138     f.close ()
 139
 140
 141 for filename in files:
 142     print "extract_texi_filenames.py: Processing %s" % filename
 143     sections = extract_sections (filename)
 144     process_sections (filename, sections)