#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
"""Script to do rudimentary checks of NeuroDebian mirrors to verify they are in good shape
"""
from urllib import urlopen
from ConfigParser import SafeConfigParser
#cfg_path="/etc/neurodebian/neurodebian.cfg"
cfg_path="./neurodebian.cfg"
cfg = SafeConfigParser()
cfg.read(cfg_path)
# load information about mirrors
mirrors = cfg.options('mirrors')
urls = dict([(x, cfg.get('mirrors', x)) for x in mirrors])
# key of the main (master) mirror in the [mirrors] section -- 'us' is an
# assumed value here, adjust to whatever key the actual config uses
main_mirror = 'us'
# all remaining mirrors are the slaves to be checked against the main one
slave_mirrors = [m for m in mirrors if m != main_mirror]
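# For reference, the kind of [mirrors] section the code above assumes in
# neurodebian.cfg -- the keys and URLs are illustrative (taken from the
# example URLs used further below), not the real config:
#
#   [mirrors]
#   us = http://neuro.debian.net/debian
#   au = http://mirror.aarnet.edu.au/pub/neurodebian
#
# cfg.options('mirrors') then yields ['us', 'au'] and urls maps each key
# to its base URL.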
def fetch_listing(url):
    """Traverses whole website, obtains listing of all files available

    TODO: eventually use scrapy, but the stable one is only at 0.8 while
    0.16 is out... so -- later
    """
    # example URLs handy for manual testing:
    #url = 'http://neuro.debian.net/debian/dists/dapper/'
    #url = "http://mirror.aarnet.edu.au/pub/neurodebian/dists/dapper/"
    from lxml.html import parse

    # fetch and parse the index page, getting the document root
    page = parse(url).getroot()
    # go through all rows with links
    rows = [row for row in page.iter('tr')]
    # the parent rows would carry time/size of each entry, but those are
    # of no use since their presence/presentation heavily varies across
    # mirrors, so let's not rely on them
    links = [(l[0].getparent().getparent(),  # row (<tr>) holding the link
              l[2].endswith('/')) +          # does it point to a directory?
             l
             for l in page.iterlinks()
             # skip navigation noise: column-sorting links, absolute links
             # (parent directory etc.), external and mailto links (the
             # first two checks are a guess at the elided conditions)
             if not (l[2].startswith('?')
                     or l[2].startswith('/')
                     or l[2].startswith('http://')
                     or l[2].startswith('mailto:'))]
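    # each entry now is lxml's (element, attribute, link, pos) tuple from
    # iterlinks(), prefixed with the enclosing row and an is-directory
    # flag -- hence the 6-way unpacking below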
    # collect files, recursing into subdirectories, and return the full
    # listing as the docstring promises
    listing = []
    for p, isdir, a, _, name, _ in links:
        full = '%s/%s' % (url.rstrip('/'), name)
        if isdir:
            listing.extend(fetch_listing(full))
        else:
            listing.append(full)
    return listing
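# e.g. (hypothetical output shape, given the reconstruction above):
#   fetch_listing('http://neuro.debian.net/debian/dists/dapper')
#   -> ['http://neuro.debian.net/debian/dists/dapper/Release', ...]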
# fetch the file listing of every mirror...
listings = {}
for m, url in urls.iteritems():
    listings[m] = fetch_listing(url + '/dists/dapper')

# ... and keep the main mirror's listing as the reference
main_listing = listings[main_mirror]
# NOTE: au has fancier index pages, so we would need to distil the page
# first some more
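# A minimal sketch of the actual "in good shape" verification, building on
# the listings gathered above; comparing repository-relative paths this way
# is an assumption about the intended check, not code from the original
# script:
def _relative(listing, base):
    # reduce '<base>/foo/bar' to 'foo/bar'
    return set(p[len(base):].lstrip('/') for p in listing)

main_files = _relative(main_listing, urls[main_mirror] + '/dists/dapper')
for m in slave_mirrors:
    missing = main_files - _relative(listings[m], urls[m] + '/dists/dapper')
    if missing:
        print "mirror '%s' is missing %d file(s), e.g. %s" \
              % (m, len(missing), sorted(missing)[0])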