sphinx/sphinxext/feed/absolutify_urls.py

   1 # By Gareth Rees
   2 # http://gareth-rees.livejournal.com/27148.html
   3
   4 import html5lib
   5 import html5lib.serializer
   6 import html5lib.treewalkers
   7 import urlparse
   8
   9 # List of (ELEMENT, ATTRIBUTE) for HTML5 attributes which contain URLs.
  10 # Based on the list at http://www.feedparser.org/docs/resolving-relative-links.html
  11 url_attributes = [
  12     ('a', 'href'),
  13     ('applet', 'codebase'),
  14     ('area', 'href'),
  15     ('blockquote', 'cite'),
  16     ('body', 'background'),
  17     ('del', 'cite'),
  18     ('form', 'action'),
  19     ('frame', 'longdesc'),
  20     ('frame', 'src'),
  21     ('iframe', 'longdesc'),
  22     ('iframe', 'src'),
  23     ('head', 'profile'),
  24     ('img', 'longdesc'),
  25     ('img', 'src'),
  26     ('img', 'usemap'),
  27     ('input', 'src'),
  28     ('input', 'usemap'),
  29     ('ins', 'cite'),
  30     ('link', 'href'),
  31     ('object', 'classid'),
  32     ('object', 'codebase'),
  33     ('object', 'data'),
  34     ('object', 'usemap'),
  35     ('q', 'cite'),
  36     ('script', 'src')]
  37
  38 def absolutify(src, base_url):
  39     """absolutify(SRC, BASE_URL): Resolve relative URLs in SRC.
  40 SRC is a string containing HTML. All URLs in SRC are resolved relative
  41 to BASE_URL. Return the body of the result as HTML."""
  42
  43     # Parse SRC as HTML.
  44     tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
  45     parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
  46     dom = parser.parse(src)
  47
  48     # Handle <BASE> if any.
  49     head = dom.getElementsByTagName('head')[0]
  50     for b in head.getElementsByTagName('base'):
  51         u = b.getAttribute('href')
  52         if u:
  53             base_url = urlparse.urljoin(base_url, u)
  54             # HTML5 4.2.3 "if there are multiple base elements with href
  55             # attributes, all but the first are ignored."
  56             break
  57
  58     # Change all relative URLs to absolute URLs by resolving them
  59     # relative to BASE_URL. Note that we need to do this even for URLs
  60     # that consist only of a fragment identifier, because Google Reader
  61     # changes href=#foo to href=http://site/#foo
  62     for tag, attr in url_attributes:
  63         for e in dom.getElementsByTagName(tag):
  64             u = e.getAttribute(attr)
  65             if u:
  66                 e.setAttribute(attr, urlparse.urljoin(base_url, u))
  67
  68     # Return the HTML5 serialization of the <BODY> of the result (we don't
  69     # want the <HEAD>: this breaks feed readers).
  70     body = dom.getElementsByTagName('body')[0]
  71     tree_walker = html5lib.treewalkers.getTreeWalker('dom')
  72     html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
  73     return u''.join(html_serializer.serialize(tree_walker(body)))
  74
  75
  76 # Alternative option, from http://stackoverflow.com/questions/589833/how-to-find-a-relative-url-and-translate-it-to-an-absolute-url-in-python/589939#589939
  77 #
  78 # import re, urlparse
  79 #
  80 # find_re = re.compile(r'\bhref\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
  81 #
  82 # def fix_urls(document, base_url):
  83 #     ret = []
  84 #     last_end = 0
  85 #     for match in find_re.finditer(document):
  86 #         url = match.group(1)
  87 #         if url[0] in "\"'":
  88 #             url = url.strip(url[0])
  89 #         parsed = urlparse.urlparse(url)
  90 #         if parsed.scheme == parsed.netloc == '': #relative to domain
  91 #             url = urlparse.urljoin(base_url, url)
  92 #             ret.append(document[last_end:match.start(1)])
  93 #             ret.append('"%s"' % (url,))
  94 #             last_end = match.end(1)
  95 #     ret.append(document[last_end:])
  96 #     return ''.join(ret)