Commit fcf7fcaa authored by Michael Hanke

Simplify logfile parser to spit out JSON

Part of the move to a javascript-based visualization
parent 7f53ce3f
......@@ -2,137 +2,69 @@
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
# Create a figure with the NeuroDebian repo subscription stats from the apache logs
# Requires the output of
# zgrep "GET /lists/[-a-z\.]\+ HTTP"*access.log* | sed -e 's,[^:]*:\([0-9\.]\+\).*\[\(.*\):.*:.*:.*/lists/\(.*\) HTTP.*,\2;\3;\1,' -e 's,/, ,g'
# either from a file or on stdin. Needs output filename as the only argument
import fileinput
import sys
import time
from datetime import datetime
import numpy as np
import matplotlib
import pylab as pl
from matplotlib.dates import date2num, num2date
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
from matplotlib.font_manager import FontProperties
from ConfigParser import SafeConfigParser
from math import ceil
dt = [('ip', '|S16'),
('loc', '|S3'),
('suite', '|S20'),
('date', float)]
def make_figure(data, ymax=None):
fig = pl.figure(figsize=(14,3))
distros = ('Debian', 'Ubuntu')
# Sorting does not actually seem to be needed on Python 2.7,
# which probably returns release codenames in the same order as
# in the config file, which is already correct.
# But since our server is still on the previous stable release,
# let's sort explicitly for now.
# 9999 for 'nd' == 'sid'
sorting_ids = dict([(x[0], len(x[1])>2 and float(x[1][2:]) or 9999)
for x in cfg.items('release backport ids')])
for idistro, distro in enumerate(distros):
ax = fig.add_subplot(1, len(distros), idistro+1)
suites = [code for code in cfg.options('release codenames')
if cfg.get('release codenames', code).count(distro)]
# sort suites according to backport ids
# and in reverse order so the freshest is on top
suites = sorted(suites,
cmp=lambda x,y: cmp(sorting_ids[x], sorting_ids[y]),
plot_datehist(ax, data, 10, suites, title=distro, ymax=ymax)
return fig
import re
import sets
import json
import operator
def plot_datehist(ax, data, bins, suites, title=None, ymax=None):
colors=['#ff0088', '#20435C', '#45902C', '#E08720']
linestyle=['-', '--']
global_x_max = None
global_x_min = None
global_y_max = None
for i, suite in enumerate(suites):
dates = data['date'][data['suite'] == suite]
# history in days
history_length = dates.max() - dates.min()
# make approx monthly bins, smaller bins yield spiky curves
# needs new=True to work with oldish numpy
(hist, bin_edges) = np.histogram(dates, np.ceil(history_length/30.))
if False:
# debug output ;-)
print dates.min(), num2date(dates.min()), dates.max(), \
num2date(dates.max()), history_length
print bin_edges
if len(bin_edges) < 2:
# protect against single data point entries by ignoring them
# wouldn't be able to draw a line anyway ;-)
width = bin_edges[1] - bin_edges[0]
# thick lines
y = hist / width
global_y_max = max(max(y), global_y_max)
ax.plot(bin_edges[:-1]+(width/2), y,
label=suite, color=colors[i%4], linestyle=linestyle[i//4], lw=2)
# transparent curve shading
ax.fill_between(bin_edges[:-1]+(width/2), 0, hist / width, alpha=0.2,
label=suite, color=colors[i%4])
# figure out axis limits to avoid whitespace in plots
x_max = bin_edges[-2] + width/2
x_min = bin_edges[0] + width/2
global_x_max = max(x_max, global_x_max)
if global_x_min is None or x_min < global_x_min:
global_x_min = x_min
ax.set_xlim(global_x_min, global_x_max)
ax.set_ylabel('New subscriptions [1/day]')
if title:
if not ymax:
# Always leave significant 5% for improvement ;-)
ymax = global_y_max * 1.05
ax.set_ylim(0, ymax)
# set x-ticks in date
# see:
# format the coords message box
ax.format_xdata = DateFormatter('%Y-%m-%d')
# pukes with old matplotlib
#font = FontProperties()
#font.set_size = 8
pl.legend(loc='upper left', #prop=font,
labelspacing=.2, borderaxespad=.2,
handletextpad=.2, borderpad=.2)
releases = {
'etch': 'Debian GNU/Linux 4.0 (etch)',
'lenny': 'Debian GNU/Linux 5.0 (lenny)',
'squeeze': 'Debian GNU/Linux 6.0 (squeeze)',
'wheezy': 'Debian testing (wheezy)',
'sid': 'Debian unstable (sid)',
'hardy': 'Ubuntu 08.04 LTS "Hardy Heron" (hardy)',
'jaunty': 'Ubuntu 09.04 "Jaunty Jackalope" (jaunty)',
'karmic': 'Ubuntu 09.10 "Karmic Koala" (karmic)',
'lucid': 'Ubuntu 10.04 LTS "Lucid Lynx" (lucid)',
'maverick': 'Ubuntu 10.10 "Maverick Meerkat" (maverick)',
'natty': 'Ubuntu 11.04 "Natty Narwhal" (natty)',
'oneiric': 'Ubuntu 11.10 "Oneiric Ocelot" (oneiric)',
'precise': 'Ubuntu 12.04 LTS "Precise Pangolin" (precise)',
'quantal': 'Ubuntu 12.10 "Quantal Quetzal" (quantal)',
'raring': 'Ubuntu 13.04 "Raring Ringtail" (raring)',
'saucy': 'Ubuntu 13.10 "Saucy Salamander" (saucy)',
if __name__ == '__main__':
if not len(sys.argv) > 1:
print 'Need output filename.'
cfg = SafeConfigParser()
data = []
for line in fileinput.FileInput(sys.argv[2:], openhook=fileinput.hook_compressed):
date, list_, ip = line.split(';')
data = {}
# get the IP, date and target release
# the timestamp is cut down to its date part (day/month/year; time of day is dropped)
listget = re.compile(r'^([0-9.:]*) .*\[([^:]*).*GET /lists/([a-z]*)')
for line in fileinput.FileInput(openhook=fileinput.hook_compressed):
match = listget.match(line)
if not match:
addr, date, release = match.groups()
if not release in releases:
# ignore fantasy names
date = datetime.strptime(date, '%d/%b/%Y')
# truncate to a week
suite, loc = list_.split('.')
date = datetime(date.year, date.month, / 7 * 7 + 1)
except ValueError:
suite = list_
loc = ''
date = datetime.strptime(date, "%d %b %Y")
data.append((ip.strip(), loc, suite, date2num(date)))
data = np.array(data, dtype=dt)
make_figure(data).savefig(sys.argv[1], bbox_inches='tight', dpi=60)
# only on Feb28...
date = datetime(date.year, date.month, / 7 * 7)
# milliseconds since epoch
date = int(time.mktime(date.timetuple()) * 1000)
rstats = data.setdefault(releases[release], {})
rtime = rstats.setdefault(date, 0)
rtime += 1
rstats[date] = rtime
data[releases[release]] = rstats
# determine the union of all timestamps
timestamps = sets.Set()
for codename, stats in data.iteritems():
export = [{'key': release,
'values': [[ts, float(data[release].setdefault(ts, 0)) / 7]
for ts in sorted(timestamps)]}
for release in sorted(data)]
print json.dumps(export)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment