#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
"""Script to do rudimentary checks of NeuroDebian mirrors to verify they are in good shape
"""
from urllib import urlopen
from ConfigParser import SafeConfigParser
#cfg_path="/etc/neurodebian/neurodebian.cfg"
cfg_path="./neurodebian.cfg"
cfg = SafeConfigParser()
cfg.read(cfg_path)
# load information about mirrors
mirrors = cfg.options('mirrors')
urls = dict([(x, cfg.get('mirrors', x)) for x in mirrors])
# key of the main (master) mirror in the [mirrors] section -- 'us' is an
# assumed value here, adjust to whatever key the actual config uses
main_mirror = 'us'
# all remaining mirrors are the slaves to be checked against the main one
slave_mirrors = [m for m in mirrors if m != main_mirror]
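# For reference, the kind of [mirrors] section the code above assumes in
# neurodebian.cfg -- the keys and URLs are illustrative (taken from the
# example URLs used further below), not the real config:
#
#   [mirrors]
#   us = http://neuro.debian.net/debian
#   au = http://mirror.aarnet.edu.au/pub/neurodebian
#
# cfg.options('mirrors') then yields ['us', 'au'] and urls maps each key
# to its base URL.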
def fetch_listing(url):
    """Traverses whole website, obtains listing of all files available

    TODO: eventually use scrapy, but the stable one is only at 0.8 while
    0.16 is out... so -- later
    """
    # example URLs handy for manual testing:
    #url = 'http://neuro.debian.net/debian/dists/dapper/'
    #url = "http://mirror.aarnet.edu.au/pub/neurodebian/dists/dapper/"
    from lxml.html import parse

    # fetch and parse the index page, getting the document root
    page = parse(url).getroot()
    # go through all rows with links
    rows = [row for row in page.iter('tr')]
    # the parent rows would carry time/size of each entry, but those are
    # of no use since their presence/presentation heavily varies across
    # mirrors, so let's not rely on them
    links = [(l[0].getparent().getparent(),  # row (<tr>) holding the link
              l[2].endswith('/')) +          # does it point to a directory?
             l
             for l in page.iterlinks()
             # skip navigation noise: column-sorting links, absolute links
             # (parent directory etc.), external and mailto links (the
             # first two checks are a guess at the elided conditions)
             if not (l[2].startswith('?')
                     or l[2].startswith('/')
                     or l[2].startswith('http://')
                     or l[2].startswith('mailto:'))]
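    # each entry now is lxml's (element, attribute, link, pos) tuple from
    # iterlinks(), prefixed with the enclosing row and an is-directory
    # flag -- hence the 6-way unpacking below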
    # collect files, recursing into subdirectories, and return the full
    # listing as the docstring promises
    listing = []
    for p, isdir, a, _, name, _ in links:
        full = '%s/%s' % (url.rstrip('/'), name)
        if isdir:
            listing.extend(fetch_listing(full))
        else:
            listing.append(full)
    return listing
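# e.g. (hypothetical output shape, given the reconstruction above):
#   fetch_listing('http://neuro.debian.net/debian/dists/dapper')
#   -> ['http://neuro.debian.net/debian/dists/dapper/Release', ...]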
# fetch the file listing of every mirror...
listings = {}
for m, url in urls.iteritems():
    listings[m] = fetch_listing(url + '/dists/dapper')

# ... and keep the main mirror's listing as the reference
main_listing = listings[main_mirror]
# NOTE: au has fancier index pages, so we would need to distil the page
# first some more
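# A minimal sketch of the actual "in good shape" verification, building on
# the listings gathered above; comparing repository-relative paths this way
# is an assumption about the intended check, not code from the original
# script:
def _relative(listing, base):
    # reduce '<base>/foo/bar' to 'foo/bar'
    return set(p[len(base):].lstrip('/') for p in listing)

main_files = _relative(main_listing, urls[main_mirror] + '/dists/dapper')
for m in slave_mirrors:
    missing = main_files - _relative(listings[m], urls[m] + '/dists/dapper')
    if missing:
        print "mirror '%s' is missing %d file(s), e.g. %s" \
              % (m, len(missing), sorted(missing)[0])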