/[webwml]/webwml/get-www-stats
ViewVC logotype

Contents of /webwml/get-www-stats

Parent Directory | Revision Log


Revision 1.1 - (show annotations) (download)
Mon Dec 27 15:39:28 2010 UTC (2 years, 5 months ago) by porridge
Branch: MAIN
Added the website hit statistics generator, produces data used by stattrans.pl
1 #!/usr/bin/python
2
3 # get-www-stats - Debian web site popularity statistics
4 # Copyright 2010 Marcin Owsiany <porridge@debian.org>
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19
20 # This program is run daily on a Debian website mirror like this:
21 #
22 # get-www-stats > stats.txt-pending && mv stats.txt-pending stats.txt
23 #
24 # The output is then exported via DDE (see http://wiki.debian.org/DDE) and used
25 # by the stattrans.pl script to sort the page lists in the Debian web site
26 # translation statistics pages.
27
28 try:
29 import json
30 except ImportError:
31 import simplejson as json
32
33 from gzip import open as gzopen
34 import logging
35 import os
36 import re
37 import sys
38
39 #logging.basicConfig(level=logging.INFO)
40
logs_dir = '/var/log/apache2'
logs_prefix = 'www.debian.org-access.log'
logs_count = 10

# Sort key given to a log without a date suffix (the live, not yet rotated
# file) so that it sorts after every dated rotation.
CURRENT_LOG_STAMP = 99999999

# Matches a two-character language suffix such as ".en.html" at the end of a URL.
LANG_SUFFIX_RE = re.compile(r'\...\.html$')


def parse_log_name(name):
    """Classify an access-log file name.

    The name is split on '-', which relies on the log prefix containing
    exactly one hyphen: two parts mean an undated log, three parts mean a
    dated rotation whose last part is the stamp (optionally followed by
    ".gz").

    Returns a (stamp, gzipped) tuple, or None when the name has an
    unexpected shape or a non-numeric stamp.
    """
    parts = name.split('-')
    gzipped = name.endswith('.gz')
    if len(parts) == 2:
        return CURRENT_LOG_STAMP, gzipped
    if len(parts) != 3:
        return None
    stamp = parts[2][:-len('.gz')] if gzipped else parts[2]
    try:
        return int(stamp), gzipped
    except ValueError:
        return None


def find_logs(directory, prefix):
    """Return the access logs found in directory, oldest first.

    Each entry is a (stamp, filename, gzipped) tuple. Files whose name does
    not start with prefix are ignored silently; files that do but cannot be
    classified are skipped with a warning.
    """
    found = []
    for name in os.listdir(directory):
        if not name.startswith(prefix):
            continue
        parsed = parse_log_name(name)
        if parsed is None:
            logging.warning('Skipping unexpected filename [%s].', name)
            continue
        stamp, gzipped = parsed
        found.append((stamp, name, gzipped))
    return sorted(found)


def normalize_url(url):
    """Fold language variants and directory requests into one page name.

    A trailing ".xx.html" is stripped first, then a trailing "/" gets
    "index" appended, so "/", "/index.en.html" and "/index.de.html" all
    count as "/index".
    """
    url = LANG_SUFFIX_RE.sub('', url)
    if url.endswith('/'):
        url += 'index'
    return url


def count_hits(directory, entries):
    """Count requests per normalized URL over the given log entries.

    entries is a sequence of (stamp, filename, gzipped) tuples as produced
    by find_logs(). Returns a dict mapping normalized URL to hit count.
    """
    counts = {}
    for _stamp, name, gzipped in entries:
        logfile = os.path.join(directory, name)
        logging.info('Reading %s.', logfile)
        opener = gzopen if gzipped else open
        # Read bytes from both openers so plain and gzipped logs are handled
        # the same way, and close each file as soon as it has been read.
        with opener(logfile, 'rb') as handle:
            for raw in handle:
                tokens = raw.split()
                if len(tokens) < 7:
                    # Truncated or malformed line: there is no request URL
                    # field to count, so skip it instead of crashing.
                    continue
                url = normalize_url(tokens[6].decode('utf-8', 'replace'))
                counts[url] = counts.get(url, 0) + 1
    return counts


def check_index_hits(counts):
    """Sanity-check the front page hit count before publishing results.

    Raises Exception when there is no data for /index or fewer than 10k
    hits; logs a warning when there are fewer than 50k hits.
    """
    hits = counts.get('/index')
    if hits is None:
        raise Exception('No data for /index')
    # The fatal threshold must be tested before the warning threshold;
    # in the opposite order the 10k check can never be reached.
    if hits < 10000:
        raise Exception('Less than 10k hits for /index')
    if hits < 50000:
        logging.warning('Less than 50k hits for /index')


def dump_counts(counts, out):
    """Write [hits, url] pairs with more than 2 hits to out as JSON.

    Pairs are ordered by descending hit count (ties by descending URL).
    """
    rows = sorted(((hits, url) for url, hits in counts.items() if hits > 2),
                  reverse=True)
    json.dump(rows, out, indent=2)


def main():
    """Read the most recent logs_count logs and print the statistics."""
    recent = find_logs(logs_dir, logs_prefix)[-logs_count:]
    counts = count_hits(logs_dir, recent)
    check_index_hits(counts)
    dump_counts(counts, sys.stdout)


if __name__ == '__main__':
    main()
90
91 # for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
92 # print '%8d %s' % (v, k)
93 # if v < 3:
94 # break
95
96 # Perl original:
97 # @f=split;
98 # $s = $f[6];
99 # $s =~ s,\...\.html,,;
100 # $s =~ s,/$,/index,;
101 # $S{$s} += 1;
102 # END{
103 # printf "%d normalized URLs\n", scalar keys %S;
104 # foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
105 # printf "%8d %s\n", $S{$k}, $k
106 # }
107 # }
108

  ViewVC Help
Powered by ViewVC 1.1.5