scripts/build/extract_texi_filenames.py

   1 #!@PYTHON@
   2 # -*- coding: utf-8 -*-
   3 # extract_texi_filenames.py
   4
   5 # USAGE:  extract_texi_filenames.py [-o OUTDIR] FILES
   6 #
   7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
   8 #
   9 # Description:
  10 # This script parses the .texi file given and creates a file with the
  11 # nodename <=> filename/anchor map.
  12 # The idea behind: Unnumbered subsections go into the same file as the
  13 # previous numbered section, @translationof gives the original node name,
  14 # which is then used for the filename/anchor.
  15 #
  16 # If this script is run on a file texifile.texi, it produces a file
  17 # texifile[.LANG].xref-map with tab-separated entries of the form
  18 #        NODE\tFILENAME\tANCHOR
  19 # LANG is the document language in case it's not 'en'
  20 # Note: The filename does not have any extension appended!
  21 # This file can then be used by our texi2html init script to determine
  22 # the correct file name and anchor for external refs
  23
  24 import sys
  25 import re
  26 import os
  27 import getopt
  28
  29 options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:m:',
  30                                      ['output=', 'split=',
  31                                       'help', 'include=',
  32                                       'master-map-file='])
  33
  34 help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
  35 Extract files names for texinfo (sub)sections from the texinfo files.
  36
  37 Options:
  38  -h, --help                     print this help
  39  -I, --include=DIRECTORY        append DIRECTORY to include search path
  40  -m, --master-map-file=FILE     use FILE as master map file
  41  -o, --output=DIRECTORY         write .xref-map files to DIRECTORY
  42  -s, --split=MODE               split manual according to MODE. Possible values
  43                                 are section and custom (default)
  44 """
  45
  46 def help (text):
  47     sys.stdout.write ( text)
  48     sys.exit (0)
  49
  50 outdir = '.'
  51 split = "custom"
  52 include_path = []
  53 master_map_file = ''
  54 initial_map = {}
  55 for opt in options_list:
  56     o = opt[0]
  57     a = opt[1]
  58     if o == '-h' or o == '--help':
  59         help (help_text % vars ())
  60     if o == '-I' or o == '--include':
  61         if os.path.isdir (a):
  62             include_path.append (a)
  63     elif o == '-o' or o == '--output':
  64         outdir = a
  65     elif o == '-s' or o == '--split':
  66         split = a
  67     elif o == '-m' or o == '--master-map-file':
  68         if os.path.isfile (a):
  69             master_map_file = a
  70     else:
  71         raise Exception ('unknown option: ' + o)
  72
  73
  74 if not os.path.isdir (outdir):
  75     if os.path.exists (outdir):
  76         os.unlink (outdir)
  77     os.makedirs (outdir)
  78
  79 include_re = re.compile (r'@include ((?!../lily-).*?\.i?texi)$', re.M)
  80 whitespaces = re.compile (r'\s+')
  81 section_translation_re = re.compile ('^@(node|(?:unnumbered|appendix)\
  82 (?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|\
  83 (?:major|chap|(?:sub){0,2})heading|lydoctitle|translationof) \
  84 (.+)$', re.MULTILINE)
  85 external_node_re = re.compile (r'\s+@c\s+external.*')
  86
  87 def expand_includes (m, filename):
  88     filepath = os.path.join (os.path.dirname (filename), m.group(1))
  89     if os.path.exists (filepath):
  90         return extract_sections (filepath)[1]
  91     else:
  92         for directory in include_path:
  93             filepath = os.path.join (directory, m.group(1))
  94             if os.path.exists (filepath):
  95                 return extract_sections (filepath)[1]
  96         print "Unable to locate include file " + filepath
  97         return ''
  98
  99 lang_re = re.compile (r'^@documentlanguage (.+)', re.M)
 100
 101 def extract_sections (filename):
 102     result = ''
 103     f = open (filename, 'r')
 104     page = f.read ()
 105     f.close()
 106     # Search document language
 107     m = lang_re.search (page)
 108     if m and m.group (1) != 'en':
 109         lang_suffix = '.' + m.group (1)
 110     else:
 111         lang_suffix = ''
 112     # Replace all includes by their list of sections and extract all sections
 113     page = include_re.sub (lambda m: expand_includes (m, filename), page)
 114     sections = section_translation_re.findall (page)
 115     for sec in sections:
 116         result += "@" + sec[0] + " " + sec[1] + "\n"
 117     return (lang_suffix, result)
 118
 119 # Convert a given node name to its proper file name (normalization as
 120 # explained in the texinfo manual:
 121 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
 122 def texinfo_file_name(title):
 123     # exception: The top node is always mapped to index.html
 124     if title == "Top":
 125         return "index"
 126     # File name normalization by texinfo (described in the texinfo manual):
 127     # 1/2: letters and numbers are left unchanged
 128     # 3/4: multiple, leading and trailing whitespace is removed
 129     title = title.strip ();
 130     title = whitespaces.sub (' ', title)
 131     # 5:   all remaining spaces are converted to '-'
 132     # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
 133     result = ''
 134     for index in range(len(title)):
 135         char = title[index]
 136         if char == ' ': # space -> '-'
 137             result += '-'
 138         elif ( ('0' <= char and char <= '9' ) or
 139                ('A' <= char and char <= 'Z' ) or
 140                ('a' <= char and char <= 'z' ) ):  # number or letter
 141             result += char
 142         else:
 143             ccode = ord(char)
 144             if ccode <= 0xFFFF:
 145                 result += "_%04x" % ccode
 146             else:
 147                 result += "__%06x" % ccode
 148     # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
 149     if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))):
 150         result = 't_g' + result
 151     return result
 152
 153 texinfo_re = re.compile (r'@.*{(.*)}')
 154 def remove_texinfo (title):
 155     return texinfo_re.sub (r'\1', title)
 156
 157 def create_texinfo_anchor (title):
 158     return texinfo_file_name (remove_texinfo (title))
 159
 160 unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
 161 file_name_section_level = {
 162     'top': 4,
 163     'chapter':3,
 164     'unnumbered':3,
 165     'appendix':3,
 166     'section':2,
 167     'unnumberedsec':2,
 168     'appendixsec':2,
 169     'subsection':1,
 170     'unnumberedsubsec':1,
 171     'appendixsubsec':1,
 172     'subsubsection':0,
 173     'unnumberedsubsubsec':0,
 174     'appendixsubsubsec':0
 175 }
 176 if split in file_name_section_level:
 177     splitting_level = file_name_section_level[split]
 178 else:
 179     splitting_level = -1
 180 def process_sections (filename, lang_suffix, page):
 181     sections = section_translation_re.findall (page)
 182     basename = os.path.splitext (os.path.basename (filename))[0]
 183     p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
 184     f = open (p, 'w')
 185
 186     this_title = ''
 187     this_filename = 'index'
 188     this_anchor = ''
 189     this_unnumbered = False
 190     had_section = False
 191     for sec in sections:
 192         if sec[0] == "node":
 193             # Write out the cached values to the file and start a new
 194             # section:
 195             if this_title and this_title != 'Top':
 196                     f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 197             had_section = False
 198             this_title = remove_texinfo (sec[1])
 199             this_anchor = create_texinfo_anchor (sec[1])
 200             # delete entry from master map file
 201             if this_title in initial_map:
 202                 del initial_map[this_title]
 203         elif sec[0] == "translationof":
 204             print sec
 205             (original_node, external_node) = external_node_re.subn ('', sec[1])
 206             original_node = remove_texinfo (original_node)
 207             # The following binds the translator to use the
 208             # translated node name in cross-references in case
 209             # it exists
 210             if external_node and original_node in initial_map:
 211                 del initial_map[original_node]
 212             anchor = create_texinfo_anchor (sec[1])
 213             # If @translationof is used, it gives the original
 214             # node name, which we use for the anchor and the file
 215             # name (if it is a numbered node)
 216             this_anchor = anchor
 217             if not this_unnumbered:
 218                 this_filename = anchor
 219             elif original_node in initial_map:
 220                 this_filename = initial_map[original_node][2]
 221         else:
 222             # Some pages might not use a node for every section, so
 223             # treat this case here, too: If we already had a section
 224             # and encounter another one before the next @node, we
 225             # write out the old one and start with the new values
 226             if had_section and this_title:
 227                 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 228                 this_title = remove_texinfo (sec[1])
 229                 this_anchor = create_texinfo_anchor (sec[1])
 230             had_section = True
 231
 232             if split == 'custom':
 233                 # unnumbered nodes use the previously used file name,
 234                 # only numbered nodes get their own filename! However,
 235                 # top-level @unnumbered still get their own file.
 236                 this_unnumbered = unnumbered_re.match (sec[0])
 237                 if not this_unnumbered:
 238                     this_filename = this_anchor
 239             elif split == 'node':
 240                 this_filename = this_anchor
 241             else:
 242                 if sec[0] in file_name_section_level and \
 243                         file_name_section_level[sec[0]] >= splitting_level:
 244                     this_filename = this_anchor
 245
 246     if this_title and this_title != 'Top':
 247         f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
 248
 249     for node in initial_map:
 250         f.write ("\t".join (initial_map[node]) + "\n")
 251     f.close ()
 252
 253 xref_map_line_re = re.compile (r'(.*?)\t(.*?)\t(.*?)$')
 254 if master_map_file:
 255     for line in open (master_map_file):
 256         m = xref_map_line_re.match (line)
 257         if m:
 258             initial_map[m.group (1)] = (m.group (1), m.group (2), m.group (3))
 259
 260 for filename in files:
 261     print "extract_texi_filenames.py: Processing %s" % filename
 262     (lang_suffix, sections) = extract_sections (filename)
 263     process_sections (filename, lang_suffix, sections)