buildscripts/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
   3 # extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
  10 # This script parses the .texi file given and creates a file with the
  11 # nodename <=> filename/anchor map.
  12 # The idea behind: Unnumbered subsections go into the same file as the
  13 # previous numbered section, @translationof gives the original node name,
  14 # which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile.xref-map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # Note: The filename does not have any extension appended!
  20 # This file can then be used by our texi2html init script to determine
  21 # the correct file name and anchor for external refs
  22
  23 import sys
  24 import re
  25 import os
  26 import getopt
  27
  28 #import langdefs
  29
  30 optlist, args = getopt.getopt (sys.argv[1:],'o:')
  31 files = args
  32
  33 outdir = '.'
  34 for x in optlist:
  35     if x[0] == '-o':
  36         outdir = x[1]
  37
  38 include_re = re.compile (r'@include ((?!../lily-).*?)\.texi$', re.M)
  39 whitespaces = re.compile (r'\s+')
  40 section_translation_re = re.compile (r'@(node|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\s*\n')
  41
  42 def expand_includes (m, filename):
  43     filepath = os.path.join (os.path.dirname (filename), m.group(1)) + '.texi'
  44     if os.path.exists (filepath):
  45         return extract_sections (filepath)
  46     else:
  47         print "Unable to locate include file " + filepath
  48         return ''
  49
  50 def extract_sections (filename):
  51     result = ''
  52     f = open (filename, 'r')
  53     page = f.read ()
  54     f.close()
  55     # Replace all includes by their list of sections and extract all sections
  56     page = include_re.sub (lambda m: expand_includes (m, filename), page)
  57     sections = section_translation_re.findall (page)
  58     for sec in sections:
  59         result += "@" + sec[0] + " " + sec[1] + "\n"
  60     return result
  61
  62 # Convert a given node name to its proper file name (normalization as explained
  63 # in the texinfo manual:
  64 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
  65 def texinfo_file_name(title):
  66     # exception: The top node is always mapped to index.html
  67     if title == "Top":
  68         return "index"
  69     # File name normalization by texinfo (described in the texinfo manual):
  70     # 1/2: letters and numbers are left unchanged
  71     # 3/4: multiple, leading and trailing whitespace is removed
  72     title = title.strip ();
  73     title = whitespaces.sub (' ', title)
  74     # 5:   all remaining spaces are converted to '-'
  75     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
  76     result = ''
  77     for index in range(len(title)):
  78         char = title[index]
  79         if char == ' ': # space -> '-'
  80             result += '-'
  81         elif ( ('0' <= char and char <= '9' ) or
  82                ('A' <= char and char <= 'Z' ) or
  83                ('a' <= char and char <= 'z' ) ):  # number or letter
  84             result += char
  85         else:
  86             ccode = ord(char)
  87             if ccode <= 0xFFFF:
  88                 result += "_%04x" % ccode
  89             else:
  90                 result += "__%06x" % ccode
  91     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
  92     if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))):
  93         result = 't_g' + result
  94     return result
  95
  96 texinfo_re = re.compile (r'@.*{(.*)}')
  97 def remove_texinfo (title):
  98     return texinfo_re.sub (r'\1', title)
  99
 100 def create_texinfo_anchor (title):
 101     return texinfo_file_name (remove_texinfo (title))
 102
 103 unnumbered_re = re.compile (r'unnumbered.*')
 104 def process_sections (filename, page):
 105     sections = section_translation_re.findall (page)
 106     # TODO: Don't rely on the file having a 4-letter extension (texi)!!!
 107     p = os.path.join (outdir, filename) [:-5] + '.xref-map'
 108     f = open (p, 'w')
 109
 110     this_title = ''
 111     this_filename = 'index'
 112     this_anchor = ''
 113     this_unnumbered = False
 114     had_section = False
 115     for sec in sections:
 116         if sec[0] == "node":
 117             # Write out the cached values to the file and start a new section:
 118             if this_title != '' and this_title != 'Top':
 119                     f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 120             had_section = False
 121             this_title = remove_texinfo (sec[1])
 122             this_anchor = create_texinfo_anchor (sec[1])
 123         elif sec[0] == "translationof":
 124             anchor = create_texinfo_anchor (sec[1])
 125             # If @translationof is used, it gives the original node name, which
 126             # we use for the anchor and the file name (if it is a numbered node)
 127             this_anchor = anchor
 128             if not this_unnumbered:
 129                 this_filename = anchor
 130         else:
 131             # Some pages might not use a node for every section, so treat this
 132             # case here, too: If we already had a section and encounter enother
 133             # one before the next @node, we write out the old one and start
 134             # with the new values
 135             if had_section and this_title != '':
 136                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 137                 this_title = remove_texinfo (sec[1])
 138                 this_anchor = create_texinfo_anchor (sec[1])
 139             had_section = True
 140
 141             # unnumbered nodes use the previously used file name, only numbered
 142             # nodes get their own filename! However, top-level @unnumbered
 143             # still get their own file.
 144             this_unnumbered = unnumbered_re.match (sec[0])
 145             if not this_unnumbered or sec[0] == "unnumbered":
 146                 this_filename = this_anchor
 147
 148     if this_title != '' and this_title != 'Top':
 149         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 150     f.close ()
 151
 152
 153 for filename in files:
 154     print "extract_texi_filenames.py: Processing %s" % filename
 155     sections = extract_sections (filename)
 156     process_sections (filename, sections)