| 1 |
#!/usr/bin/python
|
| 2 |
|
| 3 |
# get-www-stats - Debian web site popularity statistics
|
| 4 |
# Copyright 2010 Marcin Owsiany <porridge@debian.org>
|
| 5 |
#
|
| 6 |
# This program is free software; you can redistribute it and/or modify
|
| 7 |
# it under the terms of the GNU General Public License as published by
|
| 8 |
# the Free Software Foundation; either version 2 of the License, or
|
| 9 |
# (at your option) any later version.
|
| 10 |
#
|
| 11 |
# This program is distributed in the hope that it will be useful,
|
| 12 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 14 |
# GNU General Public License for more details.
|
| 15 |
#
|
| 16 |
# You should have received a copy of the GNU General Public License
|
| 17 |
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
| 18 |
|
| 19 |
|
| 20 |
# This program is run from a crontab on a Debian website mirror like this:
|
| 21 |
#
|
| 22 |
# # Atomically and concurrent-safely create a stats.tgz
|
| 23 |
# 18 3 * * * cd "$HOME" && d=$(mktemp -d stats-wip-XXXXXXXXXX) && printf '{"hostname":"\%s"}' $(hostname -f) > "$d/stats.meta.json" && ./get-www-stats > "$d/stats.json" && tar zcf stats-wip.tgz "$d" && rm -rf "$d" && mv stats-wip.tgz stats.tgz
|
| 24 |
#
|
| 25 |
# And the output is transferred to dde.debian.net like this:
|
| 26 |
#
|
| 27 |
# # Atomically transfer stats and replace them.
|
| 28 |
# 18 4 * * * cd $HOME && { [ ! -e stats-old ] || please_cleanup_failed_run ; } && cp -al stats-new stats-old && ln -s stats-old stats-old.s && mv -T stats-old.s stats && { scp -q -i .ssh/stats-transfer-nopass senfl.debian.org:stats.tgz stats.tgz || scp_failed ; } && rm -rf stats-new && mkdir stats-new && tar zxf stats.tgz -C stats-new --strip-components=1 && rm stats.tgz && ln -s stats-new stats-new.s && mv -T stats-new.s stats && rm -rf stats-old
|
| 29 |
#
|
| 30 |
# The output is then exported via DDE (see http://wiki.debian.org/DDE) and used
|
| 31 |
# by the stattrans.pl script to sort the page lists in the Debian web site
|
| 32 |
# translation statistics pages.
|
| 33 |
|
| 34 |
try:
|
| 35 |
import json
|
| 36 |
except ImportError:
|
| 37 |
import simplejson as json
|
| 38 |
|
| 39 |
from gzip import open as gzopen
|
| 40 |
import logging
|
| 41 |
import os
|
| 42 |
import re
|
| 43 |
import sys
|
| 44 |
|
| 45 |
#logging.basicConfig(level=logging.INFO)
|
| 46 |
|
| 47 |
logs_dir = '/var/log/apache2'
logs_prefix = 'www.debian.org-access.log'
logs_count = 10

# Sort key given to the live (not yet rotated) log so that it orders after
# every rotated log whose numeric stamp is smaller than this value.
current_log_stamp = 99999999

# Thresholds for the '/index' sanity check performed before any output.
index_hits_fatal = 10000
index_hits_warn = 50000

# A dot, any two characters and '.html' at the very end of the URL
# (presumably the language code of a translated page, e.g. '.en.html').
lang_suffix_re = re.compile(r'\...\.html$')
trailing_slash_re = re.compile(r'/$')


def classify_log(name):
    """Turn a log file name into a sortable ``(stamp, name, gzipped)`` tuple.

    The name is split on '-'.  Two parts mean the live log, which gets
    ``current_log_stamp``.  Three parts mean a rotated log whose last part
    is a numeric stamp, optionally followed by '.gz'.

    Returns None (after logging a warning) for any other shape, including
    a stamp that is not a number.
    """
    parts = name.split('-')
    if len(parts) == 2:
        return (current_log_stamp, name, False)
    if len(parts) == 3:
        gzipped = name.endswith('.gz')
        stamp = parts[2][:-3] if gzipped else parts[2]
        try:
            return (int(stamp), name, gzipped)
        except ValueError:
            # Fall through to the common "unexpected filename" warning.
            pass
    logging.warning('Skipping unexpected filename [%s].', name)
    return None


def select_logs(names, prefix, count):
    """Return the ``count`` highest-stamped logs among ``names``, oldest first.

    Names that do not start with ``prefix`` are ignored silently; names
    that start with it but cannot be classified are skipped with a warning.
    Each returned item is a ``(stamp, name, gzipped)`` tuple.
    """
    found = []
    for name in names:
        if not name.startswith(prefix):
            continue
        entry = classify_log(name)
        if entry is not None:
            found.append(entry)
    return sorted(found)[-count:]


def normalize_url(url):
    """Strip a trailing '.xx.html' and expand a trailing '/' to '/index'."""
    url = lang_suffix_re.sub('', url)
    return trailing_slash_re.sub('/index', url)


def count_urls(lines, counts=None):
    """Add one hit per log line to ``counts``, keyed by normalized URL.

    The URL is the seventh whitespace-separated field of each line.  Lines
    with fewer than seven fields are skipped instead of aborting the run.
    Byte strings are decoded as UTF-8 (undecodable bytes are replaced) so
    that plain and gzipped logs produce the same kind of key.

    Returns ``counts`` (a new dict when none was passed in).
    """
    if counts is None:
        counts = {}
    skipped = 0
    for line in lines:
        tokens = line.split()
        if len(tokens) < 7:
            skipped += 1
            continue
        url = tokens[6]
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        url = normalize_url(url)
        counts[url] = counts.get(url, 0) + 1
    if skipped:
        logging.info('Skipped %d malformed line(s).', skipped)
    return counts


def read_log(path, gzipped, counts):
    """Count the hits of one log file into ``counts`` and close the file."""
    opener = gzopen if gzipped else open
    # Binary mode for both openers, so each yields the same line type.
    handle = opener(path, 'rb')
    # try/finally rather than `with`: gzip files only became context
    # managers in Python 2.7.
    try:
        count_urls(handle, counts)
    finally:
        handle.close()


def check_index_hits(counts):
    """Refuse to publish statistics that look incomplete.

    Raises Exception when '/index' is missing or has fewer than
    ``index_hits_fatal`` hits; logs a warning when it has fewer than
    ``index_hits_warn`` hits.
    """
    if '/index' not in counts:
        raise Exception('No data for /index')
    hits = counts['/index']
    # The fatal threshold must be tested first: it is the smaller one, so
    # testing the warning threshold first would make it unreachable.
    if hits < index_hits_fatal:
        raise Exception('Less than 10k hits for /index')
    if hits < index_hits_warn:
        logging.warning('Less than 50k hits for /index')


def rank_counts(counts):
    """Return ``(hits, url)`` pairs with more than two hits, highest first."""
    return sorted([(hits, url) for (url, hits) in counts.items() if hits > 2],
                  reverse=True)


def main():
    """Count hits in the newest logs and write the ranking to stdout as JSON."""
    counts = {}
    chosen = select_logs(os.listdir(logs_dir), logs_prefix, logs_count)
    for _stamp, name, gzipped in chosen:
        logfile = os.path.join(logs_dir, name)
        logging.info('Reading %s.', logfile)
        read_log(logfile, gzipped, counts)
    check_index_hits(counts)
    json.dump(rank_counts(counts), sys.stdout, indent=2)


if __name__ == '__main__':
    main()
|
| 96 |
|
| 97 |
# for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
|
| 98 |
# print '%8d %s' % (v, k)
|
| 99 |
# if v < 3:
|
| 100 |
# break
|
| 101 |
|
| 102 |
# Perl original:
|
| 103 |
# @f=split;
|
| 104 |
# $s = $f[6];
|
| 105 |
# $s =~ s,\...\.html,,;
|
| 106 |
# $s =~ s,/$,/index,;
|
| 107 |
# $S{$s} += 1;
|
| 108 |
# END{
|
| 109 |
# printf "%d normalized URLs\n", scalar keys %S;
|
| 110 |
# foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
|
| 111 |
# printf "%8d %s\n", $S{$k}, $k
|
| 112 |
# }
|
| 113 |
# }
|
| 114 |
|