scripts/build/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
   3 # extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
# This script parses the given .texi file(s) and creates, for each of them,
# a file with the nodename <=> filename/anchor map.
# The idea behind it: unnumbered subsections go into the same file as the
# previous numbered section; @translationof gives the original node name,
# which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile[.LANG].xref-map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # LANG is the document language in case it's not 'en'
  20 # Note: The filename does not have any extension appended!
  21 # This file can then be used by our texi2html init script to determine
  22 # the correct file name and anchor for external refs
  23
  24 import sys
  25 import re
  26 import os
  27 import getopt
  28
  29 options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:',
  30                                ['output=', 'split=', 'help', 'include='])
  31
  32 help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
  33 Extract files names for texinfo (sub)sections from the texinfo files.
  34
  35 Options:
  36  -h, --help                     print this help
  37  -I, --include=DIRECTORY        append DIRECTORY to include search path
  38  -o, --output=DIRECTORY         write .xref-map files to DIRECTORY
  39  -s, --split=MODE               split manual according to MODE. Possible values
  40                                 are section and custom (default)
  41 """
  42
  43 def help (text):
  44     sys.stdout.write ( text)
  45     sys.exit (0)
  46
  47 outdir = '.'
  48 split = "custom"
  49 include_path = []
  50 for opt in options_list:
  51     o = opt[0]
  52     a = opt[1]
  53     if o == '-h' or o == '--help':
  54         help (help_text % vars ())
  55     if o == '-I' or o == '--include':
  56         if os.path.isdir (a):
  57             include_path.append (a)
  58     elif o == '-o' or o == '--output':
  59         outdir = a
  60     elif o == '-s' or o == '--split':
  61         split = a
  62     else:
  63         raise Exception ('unknown option: ' + o)
  64
  65
  66 if not os.path.isdir (outdir):
  67     if os.path.exists (outdir):
  68         os.unlink (outdir)
  69     os.makedirs (outdir)
  70
  71 include_re = re.compile (r'@include ((?!../lily-).*?\.i?texi)$', re.M)
  72 whitespaces = re.compile (r'\s+')
  73 section_translation_re = re.compile ('^@(node|(?:unnumbered|appendix)\
  74 (?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|\
  75 (?:major|chap|(?:sub){0,2})heading|translationof|lydoctitle) (.*?)\\s*$', re.MULTILINE)
  76
  77 def expand_includes (m, filename):
  78     filepath = os.path.join (os.path.dirname (filename), m.group(1))
  79     if os.path.exists (filepath):
  80         return extract_sections (filepath)[1]
  81     else:
  82         for directory in include_path:
  83             filepath = os.path.join (directory, m.group(1))
  84             if os.path.exists (filepath):
  85                 return extract_sections (filepath)[1]
  86         print "Unable to locate include file " + filepath
  87         return ''
  88
  89 lang_re = re.compile (r'^@documentlanguage (.+)', re.M)
  90
  91 def extract_sections (filename):
  92     result = ''
  93     f = open (filename, 'r')
  94     page = f.read ()
  95     f.close()
  96     # Search document language
  97     m = lang_re.search (page)
  98     if m and m.group (1) != 'en':
  99         lang_suffix = '.' + m.group (1)
 100     else:
 101         lang_suffix = ''
 102     # Replace all includes by their list of sections and extract all sections
 103     page = include_re.sub (lambda m: expand_includes (m, filename), page)
 104     sections = section_translation_re.findall (page)
 105     for sec in sections:
 106         result += "@" + sec[0] + " " + sec[1] + "\n"
 107     return (lang_suffix, result)
 108
 109 # Convert a given node name to its proper file name (normalization as explained
 110 # in the texinfo manual:
 111 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
 112 def texinfo_file_name(title):
 113     # exception: The top node is always mapped to index.html
 114     if title == "Top":
 115         return "index"
 116     # File name normalization by texinfo (described in the texinfo manual):
 117     # 1/2: letters and numbers are left unchanged
 118     # 3/4: multiple, leading and trailing whitespace is removed
 119     title = title.strip ();
 120     title = whitespaces.sub (' ', title)
 121     # 5:   all remaining spaces are converted to '-'
 122     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
 123     result = ''
 124     for index in range(len(title)):
 125         char = title[index]
 126         if char == ' ': # space -> '-'
 127             result += '-'
 128         elif ( ('0' <= char and char <= '9' ) or
 129                ('A' <= char and char <= 'Z' ) or
 130                ('a' <= char and char <= 'z' ) ):  # number or letter
 131             result += char
 132         else:
 133             ccode = ord(char)
 134             if ccode <= 0xFFFF:
 135                 result += "_%04x" % ccode
 136             else:
 137                 result += "__%06x" % ccode
 138     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
 139     if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))):
 140         result = 't_g' + result
 141     return result
 142
 143 texinfo_re = re.compile (r'@.*{(.*)}')
 144 def remove_texinfo (title):
 145     return texinfo_re.sub (r'\1', title)
 146
 147 def create_texinfo_anchor (title):
 148     return texinfo_file_name (remove_texinfo (title))
 149
 150 unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
 151 file_name_section_level = {
 152     'top': 4,
 153     'chapter':3,
 154     'unnumbered':3,
 155     'appendix':3,
 156     'section':2,
 157     'unnumberedsec':2,
 158     'appendixsec':2,
 159     'subsection':1,
 160     'unnumberedsubsec':1,
 161     'appendixsubsec':1,
 162     'subsubsection':0,
 163     'unnumberedsubsubsec':0,
 164     'appendixsubsubsec':0
 165 }
 166 if split in file_name_section_level:
 167     splitting_level = file_name_section_level[split]
 168 else:
 169     splitting_level = -1
 170 def process_sections (filename, lang_suffix, page):
 171     sections = section_translation_re.findall (page)
 172     basename = os.path.splitext (os.path.basename (filename))[0]
 173     p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
 174     f = open (p, 'w')
 175
 176     this_title = ''
 177     this_filename = 'index'
 178     this_anchor = ''
 179     this_unnumbered = False
 180     had_section = False
 181     for sec in sections:
 182         if sec[0] == "node":
 183             # Write out the cached values to the file and start a new section:
 184             if this_title and this_title != 'Top':
 185                     f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 186             had_section = False
 187             this_title = remove_texinfo (sec[1])
 188             this_anchor = create_texinfo_anchor (sec[1])
 189         elif sec[0] == "translationof":
 190             anchor = create_texinfo_anchor (sec[1])
 191             # If @translationof is used, it gives the original node name, which
 192             # we use for the anchor and the file name (if it is a numbered node)
 193             this_anchor = anchor
 194             if not this_unnumbered:
 195                 this_filename = anchor
 196         else:
 197             # Some pages might not use a node for every section, so treat this
 198             # case here, too: If we already had a section and encounter another
 199             # one before the next @node, we write out the old one and start
 200             # with the new values
 201             if had_section and this_title:
 202                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 203                 this_title = remove_texinfo (sec[1])
 204                 this_anchor = create_texinfo_anchor (sec[1])
 205             had_section = True
 206
 207             if split == 'custom':
 208                 # unnumbered nodes use the previously used file name, only numbered
 209                 # nodes get their own filename! However, top-level @unnumbered
 210                 # still get their own file.
 211                 this_unnumbered = unnumbered_re.match (sec[0])
 212                 if not this_unnumbered:
 213                     this_filename = this_anchor
 214             elif split == 'node':
 215                 this_filename = this_anchor
 216             else:
 217                 if sec[0] in file_name_section_level and \
 218                         file_name_section_level[sec[0]] >= splitting_level:
 219                     this_filename = this_anchor
 220
 221     if this_title and this_title != 'Top':
 222         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 223     f.close ()
 224
 225
 226 for filename in files:
 227     print "extract_texi_filenames.py: Processing %s" % filename
 228     (lang_suffix, sections) = extract_sections (filename)
 229     process_sections (filename, lang_suffix, sections)