#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
"""Script to do rudimentary checks of NeuroDebian mirrors to verify they are in good shape

The mirror URLs are read from the ``[mirrors]`` section of the configuration
file at ``cfg_path``.  The directory index pages of the selected mirror(s) are
then walked recursively and every relative entry found is printed.
"""

import sys

try:
    # Python 2 module layout
    from urllib import urlopen
    from ConfigParser import SafeConfigParser
except ImportError:
    # Python 3 module layout; SafeConfigParser was folded into ConfigParser
    from urllib.request import urlopen
    from configparser import ConfigParser as SafeConfigParser

#cfg_path="/etc/neurodebian/neurodebian.cfg"
cfg_path = "./neurodebian.cfg"
main_mirror = 'us-nh'

# Set to True to walk every configured mirror instead of only `main_mirror`.
check_all_mirrors = False

# Links starting with any of these are not entries of the directory being
# listed (absolute paths, sort-order queries, fragments, external sites,
# e-mail addresses, parent directory) and must not be printed or followed;
# following '../' would make the traversal recurse forever.
_SKIPPED_HREF_PREFIXES = (
    '/', '?', '#', 'http://', 'https://', 'mailto:', '../',
)

# read configuration
cfg = SafeConfigParser()
cfg.read(cfg_path)

# load information about mirrors
mirrors = cfg.options('mirrors')
urls = dict((name, cfg.get('mirrors', name)) for name in mirrors)
# Every mirror except the main one.  list.pop() would return the removed
# main mirror itself (and mutate `mirrors`), so filter instead.
slave_mirrors = [name for name in mirrors if name != main_mirror]


def fetch_listing(url):
    """Recursively print the entries reachable from the index page at `url`.

    The page is parsed with ``lxml.html``; every relative ``href`` link is
    printed, and links ending in ``/`` are treated as sub-directories and
    traversed in turn.

    Timestamps and sizes shown in the index tables are deliberately ignored:
    their presence and presentation vary heavily across mirrors.

    TODO: eventually use scrapy, but stable one has only 0.8 while
    0.16 is out... so -- later

    Parameters
    ----------
    url : str
      URL of a directory index page.

    Returns
    -------
    None
      The listing is only printed to stdout.
    """
    # Imported lazily so that lxml is required only once a page is fetched.
    from lxml.html import parse

    print(url)
    page = parse(url).getroot()
    if page is None:
        # document without a root element -- nothing to list
        return

    # Directory names already end with '/', so strip it before joining to
    # avoid accumulating '//' in the URLs of nested directories.
    base = url.rstrip('/')
    for _element, attribute, href, _pos in page.iterlinks():
        if attribute != 'href' or not href:
            continue
        if href.startswith(_SKIPPED_HREF_PREFIXES):
            continue
        print(href)
        if href.endswith('/'):
            fetch_listing('%s/%s' % (base, href))


if check_all_mirrors:
    for mirror, mirror_url in urls.items():
        print("Mirror %s" % mirror)
        fetch_listing(mirror_url + '/dists/dapper')
else:
    fetch_listing(urls[main_mirror] + '/dists/dapper')

# NOTE: the 'au' mirror has fancier index pages, so those pages would need
# to be distilled further before they can be checked this way.