From fcf7fcaa3885547d9916120b28077c69a21f6ec3 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Wed, 1 May 2013 08:06:58 +0200 Subject: [PATCH] Simplify logfile parser to spit out JSON Part of the move to a javascript-based visualization --- tools/nd_apachelogs2subscriptionstats | 180 ++++++++------------------ 1 file changed, 56 insertions(+), 124 deletions(-) diff --git a/tools/nd_apachelogs2subscriptionstats b/tools/nd_apachelogs2subscriptionstats index 4de60c1..c856337 100755 --- a/tools/nd_apachelogs2subscriptionstats +++ b/tools/nd_apachelogs2subscriptionstats @@ -2,137 +2,69 @@ # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- # vi: set ft=python sts=4 ts=4 sw=4 et: # -# Create a figure with the NeuroDebian repo subscription stats from the apache logs -# Requires out put of -# zgrep "GET /lists/[-a-z\.]\+ HTTP" neuro.debian.net-*access.log* | sed -e 's,[^:]*:\([0-9\.]\+\).*\[\(.*\):.*:.*:.*/lists/\(.*\) HTTP.*,\2;\3;\1,' -e 's,/, ,g' -# either from a file or on stdin. 
Needs output filename as the only argument - import fileinput import sys +import time from datetime import datetime -import numpy as np -import matplotlib -matplotlib.use('Agg') -import pylab as pl -from matplotlib.dates import date2num, num2date -from matplotlib.dates import YearLocator, MonthLocator, DateFormatter -from matplotlib.font_manager import FontProperties -from ConfigParser import SafeConfigParser -from math import ceil - -dt = [('ip', '|S16'), - ('loc', '|S3'), - ('suite', '|S20'), - ('date', float)] - - -def make_figure(data, ymax=None): - fig = pl.figure(figsize=(14,3)) - distros = ('Debian', 'Ubuntu') - # Sorting is actually seems to be not needed on Python 2.7 - # which probably returns release codenames in the order as - # in the config file which is already correct - # But since our server is still on previous stable release - # let's sort for now explicitly - # 9999 for 'nd' == 'sid' - sorting_ids = dict([(x[0], len(x[1])>2 and float(x[1][2:]) or 9999) - for x in cfg.items('release backport ids')]) - for idistro, distro in enumerate(distros): - ax = fig.add_subplot(1, len(distros), idistro+1) - suites = [code for code in cfg.options('release codenames') - if cfg.get('release codenames', code).count(distro)] - # sort suites according to backport ids - # and in reverse order so the freshiest is on top - suites = sorted(suites, - cmp=lambda x,y: cmp(sorting_ids[x], sorting_ids[y]), - reverse=True) - plot_datehist(ax, data, 10, suites, title=distro, ymax=ymax) - fig.autofmt_xdate() - return fig - +import re +import sets +import json +import operator -def plot_datehist(ax, data, bins, suites, title=None, ymax=None): - colors=['#ff0088', '#20435C', '#45902C', '#E08720'] - linestyle=['-', '--'] - global_x_max = None - global_x_min = None - global_y_max = None - for i, suite in enumerate(suites): - dates = data['date'][data['suite'] == suite] - # history in days - history_length = dates.max() - dates.min() - # make approx monthly bins, smaller bins 
yield spiky curves - # needs new=True to work with oldish numpy - (hist, bin_edges) = np.histogram(dates, np.ceil(history_length/30.)) - if False: - # debug output ;-) - print dates.min(), num2date(dates.min()), dates.max(), \ - num2date(dates.max()), history_length - print bin_edges - if len(bin_edges) < 2: - # protect against single data point entries by ignoring them - # wouldn't be able to draw a line anyway ;-) - continue - width = bin_edges[1] - bin_edges[0] - # think lines - y = hist / width - global_y_max = max(max(y), global_y_max) - ax.plot(bin_edges[:-1]+(width/2), y, - label=suite, color=colors[i%4], linestyle=linestyle[i//4], lw=2) - # transparent curve shading - ax.fill_between(bin_edges[:-1]+(width/2), 0, hist / width, alpha=0.2, - label=suite, color=colors[i%4]) - # figure out axis limits to avoid whitespace in plots - x_max = bin_edges[-2] + width/2 - x_min = bin_edges[0] + width/2 - - global_x_max = max(x_max, global_x_max) - if global_x_min is None or x_min < global_x_min: - global_x_min = x_min - - ax.set_xlim(global_x_min, global_x_max) - ax.set_ylabel('New subscriptions [1/day]') - if title: - ax.set_title(title) - if not ymax: - # Always leave significant 5% for improvement ;-) - ymax = global_y_max * 1.05 - ax.set_ylim(0, ymax) - # set x-ticks in date - # see: http://matplotlib.sourceforge.net/examples/api/date_demo.html - ax.xaxis.set_major_locator(YearLocator()) - ax.xaxis.set_major_formatter(DateFormatter('\n\n%Y')) - ax.xaxis.set_minor_locator(MonthLocator(interval=2)) - ax.xaxis.set_minor_formatter(DateFormatter('%b')) - # format the coords message box - ax.format_xdata = DateFormatter('%Y-%m-%d') - ax.grid(True) - # pukes with old matplotlib - #font = FontProperties() - #font.set_size = 8 - pl.legend(loc='upper left', #prop=font, - labelspacing=.2, borderaxespad=.2, - handletextpad=.2, borderpad=.2) +releases = { + 'etch': 'Debian GNU/Linux 4.0 (etch)', + 'lenny': 'Debian GNU/Linux 5.0 (lenny)', + 'squeeze': 'Debian GNU/Linux 6.0 
(squeeze)', + 'wheezy': 'Debian testing (wheezy)', + 'sid': 'Debian unstable (sid)', + 'hardy': 'Ubuntu 08.04 LTS "Hardy Heron" (hardy)', + 'jaunty': 'Ubuntu 09.04 "Jaunty Jackalope" (jaunty)', + 'karmic': 'Ubuntu 09.10 "Karmic Koala" (karmic)', + 'lucid': 'Ubuntu 10.04 LTS "Lucid Lynx" (lucid)', + 'maverick': 'Ubuntu 10.10 "Maverick Meerkat" (maverick)', + 'natty': 'Ubuntu 11.04 "Natty Narwhal" (natty)', + 'oneiric': 'Ubuntu 11.10 "Oneiric Ocelot" (oneiric)', + 'precise': 'Ubuntu 12.04 LTS "Precise Pangolin" (precise)', + 'quantal': 'Ubuntu 12.10 "Quantal Quetzal" (quantal)', + 'raring': 'Ubuntu 13.04 "Raring Ringtail" (raring)', + 'saucy': 'Ubuntu 13.10 "Saucy Salamander" (saucy)', +} if __name__ == '__main__': - if not len(sys.argv) > 1: - print 'Need output filename.' - sys.exit(1) - cfg_path="/home/neurodebian/neurodebian.git/neurodebian.cfg" - #cfg_path="../neurodebian.cfg" - cfg = SafeConfigParser() - cfg.read(cfg_path) - data = [] - for line in fileinput.FileInput(sys.argv[2:], openhook=fileinput.hook_compressed): - date, list_, ip = line.split(';') + data = {} + # get the IP, date and target release + # the date is truncated to the start of a week-long bin within its month + listget = re.compile(r'^([0-9.:]*) .*\[([^:]*).*GET /lists/([a-z]*)') + for line in fileinput.FileInput(openhook=fileinput.hook_compressed): + match = listget.match(line) + if not match: + continue + addr, date, release = match.groups() + if not release in releases: + # ignore fantasy names + continue + date = datetime.strptime(date, '%d/%b/%Y') + # truncate to a week try: - suite, loc = list_.split('.') + date = datetime(date.year, date.month, date.day / 7 * 7 + 1) except ValueError: - suite = list_ - loc = '' - date = datetime.strptime(date, "%d %b %Y") - data.append((ip.strip(), loc, suite, date2num(date))) - data = np.array(data, dtype=dt) - make_figure(data).savefig(sys.argv[1], bbox_inches='tight', dpi=60) + # only on Feb 28 of a non-leap year (there is no Feb 29)
+ date = datetime(date.year, date.month, date.day / 7 * 7) + # milliseconds since epoch + date = int(time.mktime(date.timetuple()) * 1000) + rstats = data.setdefault(releases[release], {}) + rtime = rstats.setdefault(date, 0) + rtime += 1 + rstats[date] = rtime + data[releases[release]] = rstats + # determine the union of all timestamps + timestamps = sets.Set() + for codename, stats in data.iteritems(): + timestamps.union_update(stats.keys()) + export = [{'key': release, + 'values': [[ts, float(data[release].setdefault(ts, 0)) / 7] + for ts in sorted(timestamps)]} + for release in sorted(data)] + print json.dumps(export) -- 2.39.2