/[webwml]/webwml/get-www-stats
ViewVC logotype

Contents of /webwml/get-www-stats

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations) (download)
Sun Jun 17 21:48:29 2012 UTC (11 months ago) by porridge
Branch: MAIN
Changes since 1.2: +1 -1 lines
Add missing quote.
1 #!/usr/bin/python
2
3 # get-www-stats - Debian web site popularity statistics
4 # Copyright 2010 Marcin Owsiany <porridge@debian.org>
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19
20 # This program is run from a crontab on a Debian website mirror like this:
21 #
22 # # Atomically and concurrent-safely create a stats.tgz
23 # 18 3 * * * cd "$HOME" && d=$(mktemp -d stats-wip-XXXXXXXXXX) && printf '{"hostname":"\%s"}' $(hostname -f) > "$d/stats.meta.json" && ./get-www-stats > "$d/stats.json" && tar zcf stats-wip.tgz "$d" && rm -rf "$d" && mv stats-wip.tgz stats.tgz
24 #
25 # And the output is transferred to dde.debian.net like this:
26 #
27 # # Atomically transfer stats and replace them.
28 # 18 4 * * * cd $HOME && { [ ! -e stats-old ] || please_cleanup_failed_run ; } && cp -al stats-new stats-old && ln -s stats-old stats-old.s && mv -T stats-old.s stats && { scp -q -i .ssh/stats-transfer-nopass senfl.debian.org:stats.tgz stats.tgz || scp_failed ; } && rm -rf stats-new && mkdir stats-new && tar zxf stats.tgz -C stats-new --strip-components=1 && rm stats.tgz && ln -s stats-new stats-new.s && mv -T stats-new.s stats && rm -rf stats-old
29 #
30 # The output is then exported via DDE (see http://wiki.debian.org/DDE) and used
31 # by the stattrans.pl script to sort the page lists in the Debian web site
32 # translation statistics pages.
33
34 try:
35 import json
36 except ImportError:
37 import simplejson as json
38
39 from gzip import open as gzopen
40 import logging
41 import os
42 import re
43 import sys
44
45 #logging.basicConfig(level=logging.INFO)
46
# Directory holding the web server logs, the filename prefix that identifies
# the logs of interest, and how many of the newest files to analyse.
logs_dir = '/var/log/apache2'
logs_prefix = 'www.debian.org-access.log'
logs_count = 10

# Build a list of (sort_key, filename, is_gzipped) tuples, one per log file.
#
# Filenames are split on '-':
#   * two parts   -> no stamp suffix (presumably the live, un-rotated log);
#                    given the key 99999999 so that it sorts last.
#   * three parts -> the third part is an integer stamp (presumably a
#                    rotation date - TODO confirm), optionally followed
#                    by '.gz'.
#   * otherwise   -> not a name this script understands; skipped with a
#                    warning.
logs = []
for name in os.listdir(logs_dir):
    if name.startswith(logs_prefix):
        fields = name.split('-')
        field_count = len(fields)
        if field_count == 2:
            logs.append((99999999, name, False))
        elif field_count == 3:
            is_gz = name.endswith('.gz')
            # Drop the three-character '.gz' suffix before parsing the stamp.
            stamp = fields[2][:-3] if is_gz else fields[2]
            logs.append((int(stamp), name, is_gz))
        else:
            logging.warn('Skipping unexpected filename [%s].' % name)
68
# Hit counters keyed by normalized URL.
counts = {}

# Matches a trailing ".XX.html" (a dot, any two characters, then ".html"),
# e.g. the ".en.html" in "/intro/about.en.html".  Compiled once here instead
# of being looked up again for every log line.
_lang_suffix_re = re.compile(r'\...\.html$')

# Only the logs_count newest files (highest sort keys) are read.
for n, f, gzipped in sorted(logs)[-logs_count:]:
    logfile = os.path.join(logs_dir, f)
    logging.info('Reading %s.' % logfile)
    opener = gzopen if gzipped else open
    log = opener(logfile)
    # try/finally rather than "with" so the handle is closed on every Python
    # version this script supports, including for gzip file objects.
    try:
        for line in log:
            tokens = line.split()
            if len(tokens) < 7:
                # A blank or truncated line must not abort the whole run.
                logging.debug('Skipping malformed line in %s.' % logfile)
                continue
            # The seventh whitespace-separated field is presumably the
            # request path of an Apache common/combined format line -
            # TODO confirm against the server's LogFormat.
            url = _lang_suffix_re.sub('', tokens[6])
            # Directory requests are counted together with their index page.
            if url.endswith('/'):
                url += 'index'
            counts[url] = counts.get(url, 0) + 1
    finally:
        log.close()
85
# Sanity-check the result using the hit count of the front page ('/index'):
#   * no entry at all  -> fail
#   * fewer than 10k   -> fail
#   * fewer than 50k   -> warn but continue
# The 10k test has to come before the 50k test: any count below 10000 is
# also below 50000, so with the tests the other way round the hard failure
# could never trigger.
if '/index' not in counts:
    raise Exception('No data for /index')
elif counts['/index'] < 10000:
    raise Exception('Less than 10k hits for /index')
elif counts['/index'] < 50000:
    # logging.warning is the supported spelling; logging.warn is a
    # deprecated alias for it.
    logging.warning('Less than 50k hits for /index')
92
# Write the statistics to standard output as a JSON array of [hits, url]
# pairs, most-hit first.  URLs seen fewer than three times are left out.
popular = [(hits, path) for (path, hits) in counts.iteritems() if hits > 2]
popular.sort(reverse=True)
json.dump(popular, sys.stdout, indent=2)
96
97 # for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
98 # print '%8d %s' % (v, k)
99 # if v < 3:
100 # break
101
102 # Perl original:
103 # @f=split;
104 # $s = $f[6];
105 # $s =~ s,\...\.html,,;
106 # $s =~ s,/$,/index,;
107 # $S{$s} += 1;
108 # END{
109 # printf "%d normalized URLs\n", scalar keys %S;
110 # foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
111 # printf "%8d %s\n", $S{$k}, $k
112 # }
113 # }
114

  ViewVC Help
Powered by ViewVC 1.1.5