#!/usr/bin/python

# get-www-stats - Debian web site popularity statistics
# Copyright 2010 Marcin Owsiany <porridge@debian.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# This program is run from debwww crontab on Debian website master server.
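# It writes its JSON report to standard output, so the cron job would
# typically redirect that to a file, e.g. (hypothetical path):
#   get-www-stats > /srv/www.debian.org/stats/popularity.json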

try:
  import json
except ImportError:
  # Python versions before 2.6 ship no json module; simplejson provides
  # the same interface.
  import simplejson as json

from gzip import open as gzopen
from glob import glob
import logging
import os
import re
import sys

# Uncomment to log progress information to stderr:
#logging.basicConfig(level=logging.INFO)

log_files = glob('/srv/weblogs.debian.org/incoming/*.debian.org/www.debian.org-access.log*')
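# Each serving host gets its own directory. A sketch of the expected
# contents (the file names are illustrative; only the pattern matters):
#   www.debian.org-access.log              current log, plain text
#   www.debian.org-access.log-20100131     rotated log
#   www.debian.org-access.log-20100131.gz  rotated and compressed log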
logs = []

for f in log_files:
  parts = os.path.split(f)[-1].split('-')
  if len(parts) == 2:
    # The current (unrotated) log; the large sentinel stamp makes it sort
    # after all rotated logs.
    logs.append((99999999, f, False))
  elif len(parts) == 3:
    # A rotated log, carrying a numeric stamp and possibly gzipped.
    if f.endswith('.gz'):
      gzipped = True
      stamp = parts[2][:-3]
    else:
      gzipped = False
      stamp = parts[2]
    logs.append((int(stamp), f, gzipped))
  else:
    logging.warning('Skipping unexpected filename [%s].' % f)

counts = {}

for n, logfile, gzipped in sorted(logs):
  logging.info('Reading %s.' % logfile)
  opener = gzopen if gzipped else open
  for line in opener(logfile):
    line = line.rstrip()
    tokens = line.split()
    # In combined log format, token 5 is the opening of the quoted request
    # field; skip truncated lines and anything other than GET requests.
    if len(tokens) < 7 or tokens[5] != '"GET':
      continue
    url = tokens[6]
64 65 66 67 68 69 70 71
    # Normalize the URL: strip any fragment and query string...
    url = re.sub(r'#.*$', '', url)
    url = re.sub(r'\?.*$', '', url)
    # ...collapse repeated slashes and resolve "./" and "../" components...
    url = re.sub(r'//+', '/', url)
    url = re.sub(r'/(\./)+', '/', url)
    url = re.sub(r'^/\.\./', '/', url)
    url = re.sub(r'/[^./]*/\.\./', '/', url)
    # ...drop the language tag and extension added by content negotiation,
    # in whichever order they appear...
    url = re.sub(r'\.([a-z]{2}|[a-z]{2}-[a-z]{2})\.(html|xml|rdf|pdf)$', '', url)
    url = re.sub(r'\.(html|xml|rdf|pdf)(\.([a-z]{2}|[a-z]{2}-[a-z]{2}))?$', '', url)
    # ...and count requests for a directory against its index page.
    url = re.sub(r'/$', '/index', url)
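    # A worked example of the normalization (illustrative paths):
    #   /intro/about.en.html?foo=1  ->  /intro/about
    #   /intro/about.html.en        ->  /intro/about
    #   /distrib/                   ->  /distrib/index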
    if url in counts:
      counts[url] += 1
    else:
      counts[url] = 1
  
# Sanity checks: fail outright on very low counts, merely warn on low ones.
# The stricter threshold must be tested first, or its branch is unreachable.
if '/index' not in counts:
  raise Exception('No data for /index')
elif counts['/index'] < 10000:
  raise Exception('Less than 10k hits for /index')
elif counts['/index'] < 50000:
  logging.warning('Less than 50k hits for /index')

# Emit (count, URL) pairs, most popular first, dropping URLs with
# fewer than 3 hits.
json.dump(sorted([(v, k) for (k, v) in counts.iteritems() if v > 2], reverse=True),
          sys.stdout,
          indent=2)
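# The dump above yields a JSON array of [count, url] pairs; an illustrative
# fragment (made-up numbers):
#   [
#     [123456, "/index"],
#     [54321, "/distrib/index"],
#     ...
#   ]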

# for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
#   print '%8d %s' % (v, k)
#   if v < 3:
#     break

# Perl original:
# @f=split;
# $s = $f[6];
# $s =~ s,\...\.html,,;
# $s =~ s,/$,/index,;
# $S{$s} += 1;
# END{
#   printf "%d normalized URLs\n", scalar keys %S;
#   foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
#     printf "%8d %s\n", $S{$k}, $k
#   }
# }