scripts/build/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
   3 # extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
  10 # This script parses the .texi file given and creates a file with the
  11 # nodename <=> filename/anchor map.
  12 # The idea behind: Unnumbered subsections go into the same file as the
  13 # previous numbered section, @translationof gives the original node name,
  14 # which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile[.LANG].xref-map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # LANG is the document language in case it's not 'en'
  20 # Note: The filename does not have any extension appended!
  21 # This file should then be used by our texi2html init script to determine
  22 # the correct file name and anchor for external refs
  23
  24 # For translated documentation: cross-references to nodes that exist
  25 # only in documentation in English are allowed, that's why the already
  26 # generated map file of docs in English is loaded with
  27 # --master-map-file option, then the node names that are defined in
  28 # the map for the manual in English but not in the translated manual
  29 # are added to the map for the translated manual.
  30
  31
  32 import sys
  33 import re
  34 import os
  35 import getopt
  36
  37 options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:m:k:q',
  38                                      ['output=', 'split=',
  39                                       'help', 'include=',
  40                                       'master-map-file=',
  41                                       'known-missing-files=',
  42                                       'quiet'])
  43
  44 help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
  45 Extract files names for texinfo (sub)sections from the texinfo files.
  46
  47 Options:
  48  -h, --help                     print this help
  49  -I, --include=DIRECTORY        append DIRECTORY to include search path
  50  -m, --master-map-file=FILE     use FILE as master map file
  51  -o, --output=DIRECTORY         write .xref-map files to DIRECTORY
  52  -s, --split=MODE               split manual according to MODE. Possible values
  53                                 are section and custom (default)
  54  -k, --known-missing-files      a filename which has a list of files known
  55                                 to be missing for this make
  56  -q, --quiet                    suppress most messages
  57 """
  58
  59 def help (text):
  60     sys.stdout.write ( text)
  61     sys.exit (0)
  62
  63 outdir = '.'
  64 split = "custom"
  65 include_path = ['.',]
  66 master_map_file = ''
  67 known_missing_files = []
  68 known_missing_files_file = ''
  69 docs_without_directories = ['changes', 'music-glossary']
  70 suppress_output = False
  71 initial_map = {}
  72 for opt in options_list:
  73     o = opt[0]
  74     a = opt[1]
  75     if o == '-h' or o == '--help':
  76         help (help_text % vars ())
  77     if o == '-I' or o == '--include':
  78         if os.path.isdir (a):
  79             include_path.append (a)
  80         else:
  81             path_list = a.split('/')
  82             file_name = path_list[len(path_list)-1]
  83             if not (file_name in docs_without_directories):
  84                 print a, 'is not a directory.'
  85                 print 'Please consider adding it to the list of '
  86                 print 'known missing files in extract_texi_filename.py.'
  87     elif o == '-o' or o == '--output':
  88         outdir = a
  89     elif o == '-s' or o == '--split':
  90         split = a
  91     elif o == '-m' or o == '--master-map-file':
  92         if os.path.isfile (a):
  93             master_map_file = a
  94     elif o == '--known-missing-files':
  95         if os.path.isfile (a):
  96             known_missing_files_file = a
  97         else:
  98             print 'Missing files list file not found: ', a
  99     elif o == '-q' or o == '--quiet':
 100         suppress_output = True
 101     else:
 102         raise Exception ('unknown option: ' + o)
 103
 104 if known_missing_files_file:
 105     missing_files = open (known_missing_files_file, 'r')
 106     known_missing_files = missing_files.read().splitlines()
 107     missing_files.close()
 108
 109 if not os.path.isdir (outdir):
 110     if os.path.exists (outdir):
 111         os.unlink (outdir)
 112     os.makedirs (outdir)
 113
 114 # Only look at @include if it is not preceeded by a @c:
 115 include_re = re.compile (r'^(?!.*@c .*@include)@include ((?!../lily-).*?\.i?te(xi|ly))$', re.M)
 116 whitespaces = re.compile (r'\s+')
 117 section_translation_re = re.compile ('^@(node|(?:unnumbered|appendix)\
 118 (?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|\
 119 (?:major|chap|(?:sub){0,2})heading|lydoctitle|translationof|nodeprefix) \
 120 (.+)$', re.MULTILINE)
 121 external_node_re = re.compile (r'\s+@c\s+external.*')
 122
 123 def expand_includes (m, filename):
 124     include_name = m.group (1)
 125     filepath = os.path.join (os.path.dirname (filename), include_name)
 126     if os.path.exists (filepath):
 127         return extract_sections (filepath)[1]
 128     else:
 129         for directory in include_path:
 130             filepath = os.path.join (directory, include_name)
 131             if os.path.exists (filepath):
 132                 return extract_sections (filepath)[1]
 133         if not (include_name in known_missing_files):
 134             # Not found
 135             print 'Warning: No such file: ' + include_name + \
 136                   ' (search path: ' + ':'.join (include_path)+')'
 137         return ''
 138
 139 lang_re = re.compile (r'^@documentlanguage (.+)', re.M)
 140
 141 def extract_sections (filename):
 142     result = ''
 143     f = open (filename, 'r')
 144     page = f.read ()
 145     f.close()
 146     # Search document language
 147     m = lang_re.search (page)
 148     if m and m.group (1) != 'en':
 149         lang_suffix = '.' + m.group (1)
 150     else:
 151         lang_suffix = ''
 152     # Replace all includes by their list of sections and extract all sections
 153     page = include_re.sub (lambda m: expand_includes (m, filename), page)
 154     sections = section_translation_re.findall (page)
 155     for sec in sections:
 156         result += "@" + sec[0] + " " + sec[1] + "\n"
 157     return (lang_suffix, result)
 158
 159 # Convert a given node name to its proper file name (normalization as
 160 # explained in the texinfo manual:
 161 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
 162 def texinfo_file_name(title):
 163     # exception: The top node is always mapped to index.html
 164     if title == "Top":
 165         return "index"
 166     # File name normalization by texinfo (described in the texinfo manual):
 167     # 1/2: letters and numbers are left unchanged
 168     # 3/4: multiple, leading and trailing whitespace is removed
 169     title = title.strip ();
 170     title = whitespaces.sub (' ', title)
 171     # 5:   all remaining spaces are converted to '-'
 172     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
 173     result = ''
 174     for index in range(len(title)):
 175         char = title[index]
 176         if char == ' ': # space -> '-'
 177             result += '-'
 178         elif ( ('0' <= char and char <= '9' ) or
 179                ('A' <= char and char <= 'Z' ) or
 180                ('a' <= char and char <= 'z' ) ):  # number or letter
 181             result += char
 182         else:
 183             ccode = ord(char)
 184             if ccode <= 0xFFFF:
 185                 result += "_%04x" % ccode
 186             else:
 187                 result += "__%06x" % ccode
 188     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
 189     if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))):
 190         result = 't_g' + result
 191     return result
 192
 193 texinfo_re = re.compile (r'@.*?{(.*?)}')
 194 def remove_texinfo (title):
 195     title = title.replace ('--', '-')
 196     return texinfo_re.sub (r'\1', title).strip ()
 197
 198 def create_texinfo_anchor (title):
 199     return texinfo_file_name (remove_texinfo (title))
 200
 201 unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
 202 file_name_section_level = {
 203     'top': 4,
 204     'chapter':3,
 205     'unnumbered':3,
 206     'appendix':3,
 207     'section':2,
 208     'unnumberedsec':2,
 209     'appendixsec':2,
 210     'subsection':1,
 211     'unnumberedsubsec':1,
 212     'appendixsubsec':1,
 213     'subsubsection':0,
 214     'unnumberedsubsubsec':0,
 215     'appendixsubsubsec':0
 216 }
 217 if split in file_name_section_level:
 218     splitting_level = file_name_section_level[split]
 219 else:
 220     splitting_level = -1
 221 def process_sections (filename, lang_suffix, page):
 222     sections = section_translation_re.findall (page)
 223     basename = os.path.splitext (os.path.basename (filename))[0]
 224     p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
 225     if not suppress_output:
 226         print 'writing:', p
 227     f = open (p, 'w')
 228
 229     node_prefix_title = ''
 230     this_title = ''
 231     this_filename = 'index'
 232     this_anchor = ''
 233     this_unnumbered = False
 234     had_section = False
 235     for sec in sections:
 236         if sec[0] == "node":
 237             # Write out the cached values to the file and start a new
 238             # section:
 239             if this_title and this_title != 'Top':
 240                     f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 241             had_section = False
 242             this_title = remove_texinfo (sec[1])
 243             this_anchor = create_texinfo_anchor (sec[1])
 244             # delete entry from master map file
 245             if this_title in initial_map:
 246                 del initial_map[this_title]
 247         elif sec[0] == "translationof":
 248             (original_node, external_node) = external_node_re.subn ('', sec[1])
 249             original_node = remove_texinfo (original_node)
 250             # The following binds the translator to use the
 251             # translated node name in cross-references in case
 252             # it exists
 253             if external_node and original_node in initial_map:
 254                 del initial_map[original_node]
 255             anchor = create_texinfo_anchor (sec[1])
 256             # If @translationof is used, it gives the original
 257             # node name, which we use for the anchor and the file
 258             # name (if it is a numbered node)
 259             this_anchor = anchor
 260             if not this_unnumbered:
 261                 this_filename = anchor
 262             elif original_node in initial_map:
 263                 this_filename = initial_map[original_node][2]
 264         elif sec[0] == "nodeprefix":
 265             node_prefix_title = remove_texinfo (sec[1])
 266             node_prefix_anchor = create_texinfo_anchor (sec[1])
 267         else:
 268             # Some pages might not use a node for every section, so
 269             # treat this case here, too: If we already had a section
 270             # and encounter another one before the next @node, we
 271             # write out the old one and start with the new values
 272             if had_section and split != 'node' and this_title:
 273                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 274                 this_title = remove_texinfo (sec[1])
 275                 this_anchor = create_texinfo_anchor (sec[1])
 276             had_section = True
 277
 278             if sec[0] == "lydoctitle" and node_prefix_title:
 279                 this_title = "%s: %s" % (node_prefix_title, this_title)
 280                 this_anchor = "%s-%s" % (node_prefix_anchor, this_anchor)
 281
 282             if split == 'custom':
 283                 # unnumbered nodes use the previously used file name,
 284                 # only numbered nodes get their own filename! However,
 285                 # top-level @unnumbered still get their own file.
 286                 this_unnumbered = unnumbered_re.match (sec[0])
 287                 if not this_unnumbered:
 288                     this_filename = this_anchor
 289             elif split == 'node':
 290                 this_filename = this_anchor
 291             else:
 292                 if sec[0] in file_name_section_level and \
 293                         file_name_section_level[sec[0]] >= splitting_level:
 294                     this_filename = this_anchor
 295
 296     if this_title and this_title != 'Top':
 297         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 298
 299     for node in initial_map:
 300         f.write ("\t".join (initial_map[node]) + "\n")
 301     f.close ()
 302
 303 xref_map_line_re = re.compile (r'(.*?)\t(.*?)\t(.*?)$')
 304 if master_map_file:
 305     for line in open (master_map_file):
 306         m = xref_map_line_re.match (line)
 307         if m:
 308             initial_map[m.group (1)] = (m.group (1), m.group (2), m.group (3))
 309
 310 for filename in files:
 311     if not suppress_output:
 312         print "extract_texi_filenames.py: Processing %s" % filename
 313     (lang_suffix, sections) = extract_sections (filename)
 314     process_sections (filename, lang_suffix, sections)