2 # http://gareth-rees.livejournal.com/27148.html
5 import html5lib.serializer
6 import html5lib.treewalkers
9 # List of (ELEMENT, ATTRIBUTE) for HTML5 attributes which contain URLs.
10 # Based on the list at http://www.feedparser.org/docs/resolving-relative-links.html
# Each entry is an (ELEMENT, ATTRIBUTE) pair naming an attribute whose
# value is a URL.  absolutify() iterates this list and resolves every
# such attribute it finds against the document's base URL.
url_attributes = [
    ('a', 'href'),
    ('applet', 'codebase'),
    ('area', 'href'),
    ('blockquote', 'cite'),
    ('body', 'background'),
    ('del', 'cite'),
    ('form', 'action'),
    ('frame', 'longdesc'),
    ('frame', 'src'),
    ('iframe', 'longdesc'),
    ('iframe', 'src'),
    ('head', 'profile'),
    ('img', 'longdesc'),
    ('img', 'src'),
    ('img', 'usemap'),
    ('input', 'src'),
    ('input', 'usemap'),
    ('ins', 'cite'),
    ('link', 'href'),
    ('object', 'classid'),
    ('object', 'codebase'),
    ('object', 'data'),
    ('object', 'usemap'),
    ('q', 'cite'),
    ('script', 'src'),
]
def absolutify(src, base_url):
    """absolutify(SRC, BASE_URL): Resolve relative URLs in SRC.

    SRC is a string containing HTML. All URLs in SRC are resolved relative
    to BASE_URL. Return the body of the result as HTML.

    If the document contains a <BASE> element with a non-empty HREF, that
    HREF (itself resolved against BASE_URL) replaces BASE_URL for the rest
    of the document.  Only attributes listed in the module-level
    `url_attributes` are rewritten; elements that do not carry the
    attribute are left untouched.
    """
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(src)

    # Handle <BASE> if any.
    head = dom.getElementsByTagName('head')[0]
    for b in head.getElementsByTagName('base'):
        u = b.getAttribute('href')
        # getAttribute() yields an empty string when the attribute is
        # absent, so a <BASE> without HREF must not affect the base URL.
        if u:
            base_url = urlparse.urljoin(base_url, u)
            # HTML5 4.2.3 "if there are multiple base elements with href
            # attributes, all but the first are ignored."
            break

    # Change all relative URLs to absolute URLs by resolving them
    # relative to BASE_URL. Note that we need to do this even for URLs
    # that consist only of a fragment identifier, because Google Reader
    # changes href=#foo to href=http://site/#foo
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            # Skip elements lacking the attribute; otherwise every such
            # element would gain the attribute set to BASE_URL itself.
            if u:
                e.setAttribute(attr, urlparse.urljoin(base_url, u))

    # Return the HTML5 serialization of the <BODY> of the result (we don't
    # want the <HEAD>: this breaks feed readers).
    body = dom.getElementsByTagName('body')[0]
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(body)))
76 # Alternative option, from http://stackoverflow.com/questions/589833/how-to-find-a-relative-url-and-translate-it-to-an-absolute-url-in-python/589939#589939
80 # find_re = re.compile(r'\bhref\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
82 # def fix_urls(document, base_url):
85 # for match in find_re.finditer(document):
86 # url = match.group(1)
88 # url = url.strip(url[0])
89 # parsed = urlparse.urlparse(url)
90 # if parsed.scheme == parsed.netloc == '': #relative to domain
91 # url = urlparse.urljoin(base_url, url)
92 # ret.append(document[last_end:match.start(1)])
93 # ret.append('"%s"' % (url,))
94 # last_end = match.end(1)
95 # ret.append(document[last_end:])