#!/usr/bin/python

# Copyright 2011 Paul Wise
# Released under the MIT/Expat license, see doc/COPYING

# Uses the snapshot.debian.org metadata database and SHA-1 based filesystem to
# compute debdiffs between Debian and individual derivatives. The metadata
# makes it possible to tell whether a particular file was ever in Debian,
# and the filesystem makes it possible to create the debdiffs.
#
# The script works approximately like this:
#
# Load the Sources files previously downloaded by get-package-lists, as
# indicated by the sources.list of the derivative.
#
# For each source package in the derivative:
#
# Check if the dsc has ever been in Debian; if not, check whether the other
# parts have, and from that decide whether the package is modified or not.
# Unmodified source packages are skipped: those with the exact same dsc
# file, or those where all the non-dsc parts are identical.
#
# Try some heuristics (name, version, changelog entries) to find out if
# the package could be based on some package that is or was in Debian.
#
# If it was not, then make a note and skip to the next one, since Debian
# might want to know about source packages that are missing from Debian.
#
# If it was, then use debdiff to create a diff and filterdiff to create a
# diff of the debian/ dir.
#
# Usage:
# compare-source-package-list <sources.list> <apt dir> <patches list> <links list> <new package list> <log file>

# FIXME: write out some statistics and rrdtool graphs
#               source package types per derivative
#               number of source packages
#               cache misses: md5, sha256, sha1, patch, changelog
# FIXME: comment the code to list assumptions and function purpose
# FIXME: add options to allow re-processing only specific packages
# FIXME: write something to clean up old files and patches
# FIXME: don't unpack or make a patch when we don't have all the parts
# FIXME: don't make a patch when we were not able to unpack the source package
# FIXME: clean up files at start of run
# FIXME: extract new debian/patches/ patches
# FIXME: print out packages that are no longer in Debian
# FIXME: deal with really large patches:
# FIXME:   kde-l10n-*: too few parts to be useful
# FIXME:   divergence: too many changelog entries between versions to be useful
# FIXME:   derivative is older than Debian
# FIXME:   derivative renamed the source package
# FIXME:   just a really big diff
# FIXME: when there are multiple dsc files in snapshots, prefer the debian/debian-archive one
# FIXME: when the source package is ancient and the dsc is missing, make a fake one to use
# FIXME: add an in-memory cache of hashes so that hashes in multiple releases hit the disk once
# FIXME: deal with rate-limited websites like alioth that do not like many requests

import re
import os
import sys
import httplib
import urllib2
import hashlib
import shutil
import logging
import tempfile
import string
import socket
import signal
import subprocess
import yaml
from debian import deb822, changelog
import apt_pkg
import psycopg2
try: import simplejson as json
except ImportError: import json
import struct

# Helper functions for Python stuff with annoying error handling

def makedirs(dirs):
        try: os.makedirs(dirs)
        except OSError: pass

def rmtree(dir):
        try: shutil.rmtree(dir)
        except OSError: pass

def remove(file):
        try: os.remove(file)
        except OSError: pass

def symlink(source, link):
        try: os.symlink(source, link)
        except OSError: pass

# http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
        # Python installs a SIGPIPE handler by default. This is usually not what
        # non-Python subprocesses expect.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# We need to map apt_pkg.version_compare return values to cmp return values
# The documentation is incorrect: http://bugs.debian.org/680891
def apt_version_cmp(a, b):
        ret = apt_pkg.version_compare(a, b)
        if ret < 0: return -1
        elif ret > 0: return 1
        else: return 0
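# For example (a sketch, with made-up versions):
#   >>> apt_version_cmp('1.0-2', '1.0-1ubuntu1')
#   1
# because apt orders 1.0-1ubuntu1 before 1.0-2, and the arbitrary magnitude
# returned by apt_pkg.version_compare() is collapsed to cmp()-style -1/0/1.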

# Config
md5_cache_dir = os.path.abspath('../md5-farm')
sha1_cache_dir = os.path.abspath('../sha1-farm')
sha256_cache_dir = os.path.abspath('../sha256-farm')
sha1_patch_dir = os.path.abspath('../sha1-patches')
sha1_lsdiff_dir = os.path.abspath('../sha1-lsdiff')
sha1_changelog_dir = os.path.abspath('../sha1-changelog')
deriv_patch_dir = os.path.abspath('patches')
global_patch_dir = os.path.abspath('../patches')
snapshot_cache_dir = '/srv/snapshot.debian.org/farm'
patch_too_large = os.path.abspath('../../doc/patch-too-large.txt')
checksum_types = ('sha1', 'sha256', 'md5sum')
checksum_hashlib = ('sha1', 'sha256', 'md5')
checksum_headers = ('Checksums-Sha1', 'Checksums-Sha256', 'Files')
user_agent = 'Debian Derivatives Census QA bot'
timeout = 60
ishex = lambda s: not(set(s)-set(string.hexdigits))
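# ishex() accepts only [0-9a-fA-F], e.g. (a sketch):
#   >>> ishex('deadbeef'), ishex('../etc/passwd')
#   (True, False)
# It is used below to refuse possibly malicious hashes from Sources files.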

# Init
apt_pkg.init()

# Preparation
sources_list = apt_pkg.SourceList()
sources_list.read_main_list()
conn = psycopg2.connect("service=snapshot-guest")
cur = conn.cursor()
remove(sys.argv[7])
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG, filename=sys.argv[7])

# Voodoo
lists_dir = apt_pkg.config.find_dir('Dir::State::lists')
source_entries = [[i for i in x.index_files if i.label=='Debian Source Index'] for x in sources_list.list]
derivative_short_name = os.path.basename(os.getcwd())
modifies_dsc_files = 0
repackaged_but_identical = 0

# Generic helper functions

def uncompressed_size(filename):
        uc_size = 0
        file_size = os.path.getsize(filename)
        with open(filename, 'rb') as f:
                magic = f.read(6)
                # *.gz: the trailer stores the uncompressed size (modulo 2^32)
                if magic[:2] == "\x1f\x8b":
                        f.seek(-4, 2)
                        data = f.read()
                        uc_size = struct.unpack('<I', data)[0]
                # *.bz2
                elif magic[:3] == 'BZh':
                        # Crude estimate based on average compression ratio of 25%
                        uc_size = file_size*4
                # *.xz
                elif magic == '\xfd7zXZ\x00':
                        cmdline = ['xz', '--verbose', '--list', filename]
                        process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                        output = process.communicate()[0]
                        if process.returncode:
                                logging.warning('xz reported failure to check size of %s:', filename)
                                logging.warning(output)
                        else:
                                for line in output.splitlines():
                                        line = line.strip()
                                        if line.startswith('Uncompressed size:'):
                                                match = re.match(r'Uncompressed size:[^\(]*\(([0-9,]+) *B\)', line)
                                                if match: uc_size = int(''.join(match.group(1).split(',')))
                                                else: logging.warning('xz reported weird output for %s: %s', filename, line)
                # *.lz
                elif magic[:4] == 'LZIP':
                        f.seek(-16, 2)
                        data = f.read(8)
                        uc_size = struct.unpack('<Q', data)[0]
        return max(file_size, uc_size)
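# A sketch of the *.gz branch in isolation (hypothetical file name): the gzip
# trailer stores ISIZE, the uncompressed size modulo 2^32, in the last 4
# bytes, which is why the result can underestimate members larger than 4GB:
#   with open('foo.tar.gz', 'rb') as f:
#       f.seek(-4, 2)
#       print struct.unpack('<I', f.read(4))[0]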

def tmp_space():
        stats = os.statvfs(tempfile.gettempdir())
        return stats.f_frsize*stats.f_bavail

# Helper functions for generating path names

def hash_path_parent(dir, hash):
        return os.path.join(dir, hash[0:2], hash[2:4])

def hash_path(dir, hash):
        return os.path.join(dir, hash[0:2], hash[2:4], hash)
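# Both helpers implement the two-level fan-out used by the snapshot farm and
# the local hash farms, e.g. (a sketch with a shortened hash):
#   >>> hash_path('../sha1-farm', 'ab12cd34')
#   '../sha1-farm/ab/12/ab12cd34'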

def hash_path_exists(dir, hash):
        return os.path.exists(os.path.join(dir, hash[0:2], hash[2:4], hash))

def snapshot_hash_path(hash):
        return hash_path(snapshot_cache_dir, hash)

def snapshot_hash_path_exists(hash):
        return hash_path_exists(snapshot_cache_dir, hash)

def part_hash_path(part):
        if 'sha1' in part:
                path = snapshot_hash_path(part['sha1'])
                if not os.path.exists(path): path = hash_path(sha1_cache_dir, part['sha1'])
                return path
        elif 'sha256' in part:
                return hash_path(sha256_cache_dir, part['sha256'])
        elif 'md5sum' in part:
                return hash_path(md5_cache_dir, part['md5sum'])
        else:
                return None

def sha1_patch_path(debian_dsc_sha1, dsc_sha1, type=None):
        path = os.path.join(hash_path(sha1_patch_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)
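# The patch for a (Debian dsc, derivative dsc) pair is keyed by both SHA-1s,
# e.g. (a sketch with shortened, made-up hashes):
#   sha1_patch_path('aa11bb22', 'cc33dd44', 'debian')
#   -> '<abspath of ../sha1-patches>/aa/11/aa11bb22/cc/33/cc33dd44.debian.patch'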

def sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1, type=None):
        path = os.path.join(hash_path(sha1_lsdiff_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        if type: path += '.%s' % type
        path += '.lsdiff'
        return os.path.abspath(path)

def shortslug(name):
        return name[:4] if name.startswith('lib') else name[0]
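# shortslug() mirrors the archive pool layout, e.g. (a sketch):
#   >>> shortslug('bash'), shortslug('libssl')
#   ('b', 'libs')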

def deriv_patch_path(name, version, debian_name, debian_version, type=None):
        path = os.path.join(deriv_patch_dir, shortslug(debian_name), debian_name, '')
        path += '_'.join((debian_name, debian_version, name, version))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)

def global_patch_path(name, version, debian_name, debian_version, type=None):
        path = os.path.join(global_patch_dir, shortslug(debian_name), debian_name, '')
        path += '_'.join(('Debian', debian_name, debian_version, derivative_short_name, name, version))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)
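# The two layouts differ only in their prefix, e.g. (a sketch with made-up
# versions, run from a derivative directory named "deriv"):
#   deriv_patch_path('foo', '1.0-1deriv1', 'foo', '1.0-1')
#   -> '<abspath of patches>/f/foo/foo_1.0-1_foo_1.0-1deriv1.patch'
#   global_patch_path('foo', '1.0-1deriv1', 'foo', '1.0-1')
#   -> '<abspath of ../patches>/f/foo/Debian_foo_1.0-1_deriv_foo_1.0-1deriv1.patch'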

# Functions for munging source packages

def convert_lzip_to_gzip(dir, name):
        cmdline = ['lzip', '-d', '--', name]
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('lzip reported failure to decompress %s:', name)
                logging.warning(output)
        bname = name[0:-3] # Strip off .lz
        cmdline = ['gzip', '-1', '--', bname] # gzip -1 to reduce overhead
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('gzip reported failure to compress %s:', bname)
                logging.warning(output)
        return (name, bname+'.gz')

def update_dsc_file(dir, dsc_name, parts):
        dsc_path = os.path.join(dir,dsc_name)
        dsc_file = open(dsc_path,'rb')
        dsc = deb822.Dsc(dsc_file)
        for (old, name) in parts:
                path = os.path.join(dir,name)
                size = os.path.getsize(path)
                with open(path,'rb') as f:
                        hashes = {}
                        for (type, func) in zip(checksum_types, checksum_hashlib):
                                hashes[type] = getattr(hashlib, func)()
                        for chunk in iter(lambda: f.read(128*64L), b''):
                                for type in checksum_types:
                                        hashes[type].update(chunk)
                        for type in checksum_types:
                                hashes[type] = hashes[type].hexdigest()
                        for (header, type) in zip(checksum_headers, checksum_types):
                                if header in dsc:
                                        dsc[header] = [{type: hashes[type], 'size': size, 'name': name} if p['name'] == old else p for p in dsc[header]]
        dsc_file.close()
        os.remove(dsc_path) # So we don't change the original that the dsc links to
        dsc_file = open(dsc_path,'wb')
        dsc.dump(dsc_file)
        dsc_file.close()

# Functions for downloading files and storing them in the hash caches

def download_and_check_hash(url, dir, hash, hash_type):
        try:
                parent = hash_path_parent(dir,hash)
                path = hash_path(dir,hash)
                logging.debug('downloading %s', url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                data = u.read()
                if hash_type == 'sha256':
                        data_hash = hashlib.sha256(data).hexdigest()
                elif hash_type == 'md5sum':
                        data_hash = hashlib.md5(data).hexdigest()
                else:
                        logging.warning('unknown hash type detected: %s %s %s', hash_type, hash, url)
                        return ('unknown', None)
                if data_hash != hash:
                        logging.warning('incorrect hash for downloaded file, ignoring: %s %s != %s %s', hash_type, hash, data_hash, url)
                        return ('unknown', None)
                sha1 = hashlib.sha1(data).hexdigest()
                sha1_path = hash_path(sha1_cache_dir, sha1)
                sha1_parent = hash_path_parent(sha1_cache_dir, sha1)
                makedirs(sha1_parent)
                snapshot_path = snapshot_hash_path(sha1)
                if os.path.exists(snapshot_path):
                        symlink(snapshot_path, path)
                        logging.debug('exists in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (True, sha1)
                else:
                        if not os.path.exists(sha1_path):
                                logging.debug('correct hash for downloaded file, saving: %s %s %s %s', hash_type, hash, sha1, url)
                                f = open(sha1_path, 'w')
                                f.write(data)
                                f.close()
                        else:
                                logging.debug('correct hash for downloaded file, not saving: already in derivs cache: %s %s %s %s', hash_type, hash, sha1, url)
                        symlink(os.path.relpath(sha1_path, os.path.dirname(path)), path)
                        logging.debug('does not exist in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (False, sha1)
        except urllib2.URLError, e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download hash file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException, e:
                logging.warning('unable to download hash file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error, e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)

def download_sha1(url, dir, sha1):
        try:
                parent = hash_path_parent(dir,sha1)
                path = hash_path(dir,sha1)
                logging.debug('downloading sha1: %s %s', sha1, url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                data = u.read()
                data_sha1 = hashlib.sha1(data).hexdigest()
                if data_sha1 == sha1:
                        logging.debug('correct sha1 for downloaded file, saving: %s %s', sha1, url)
                        if not os.path.exists(path):
                                f = open(path, 'w')
                                f.write(data)
                                f.close()
                        return (False, sha1)
                else:
                        logging.warning('incorrect sha1 for downloaded file, ignoring: %s != %s %s', sha1, data_sha1, url)
                        return ('unknown', None)
        except urllib2.URLError, e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download sha1 file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException, e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error, e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', e, url)
                return ('unknown', None)

# Functions for checking the hash caches

def check_hash_cache(dir, hash, hash_type, url):
        logging.debug('checking hash cache: %s %s', hash_type, hash)
        path = hash_path(dir, hash)
        try:
                result = os.readlink(path)
                path = os.path.join(os.path.dirname(path), result)
        except OSError:
                logging.debug('does not exist in hash cache: %s %s', hash_type, hash)
                return download_and_check_hash(url, dir, hash, hash_type)
        logging.debug('exists in hash cache: %s %s', hash_type, hash)
        sha1 = os.path.basename(path)
        if snapshot_hash_path_exists(sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                logging.debug('missing in derivatives sha1 cache: %s', sha1)
                return download_and_check_hash(url, dir, hash, hash_type)

def check_sha1_cache(sha1, url):
        logging.debug('checking sha1 caches: %s', sha1)
        if snapshot_hash_path_exists(sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                logging.debug('does not exist in any sha1 caches: %s', sha1)
                return download_sha1(url, sha1_cache_dir, sha1)

def status(type, hash, url):
        logging.debug('checking status of hash: %s %s %s', type, hash, url)
        if type == 'sha1':
                (ret, sha1) = check_sha1_cache(hash, url)
        elif type == 'sha256':
                (ret, sha1) = check_hash_cache(sha256_cache_dir, hash, type, url)
        elif type == 'md5sum':
                (ret, sha1) = check_hash_cache(md5_cache_dir, hash, type, url)
        else:
                logging.warning('unknown hash type detected: %s %s %s', type, hash, url)
                return ('unknown', None)
        if ret is True: return ('unmodified', sha1)
        elif ret is False: return ('modified', sha1)
        else: return (ret, sha1)
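# e.g. (a sketch): status('md5sum', md5, url) returns ('unmodified', sha1)
# when the file is in the snapshot.debian.org farm, ('modified', sha1) when
# it is only in the derivatives cache, and ('unknown', None) when it could
# not be identified or downloaded.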

# Functions for getting information about source packages

def get_info(srcpkg):
        dsc = None
        for header in checksum_headers:
                if not dsc and header in srcpkg:
                        dsc = [x for x in srcpkg[header] if x['name'].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any dsc files')
                return None
        if len(dsc) > 1:
                logging.warning('found multiple dsc files: %s' % ' '.join([d['name'] for d in dsc]))
                return None
        dsc = dsc[0]
        dsc_name = dsc['name']
        dsc_hash_type, dsc_hash = [(k, v) for k, v in dsc.iteritems() if k not in ('name', 'size')][0]

        parts = []
        part_names = []
        for header in checksum_headers:
                if header in srcpkg:
                        for part in srcpkg[header]:
                                if 'name' in part and part['name'] not in part_names and not part['name'].endswith('.dsc'):
                                        parts.append(part)
                                        part_names.append(part['name'])

        return (dsc_hash_type, dsc_hash, dsc_name, parts)
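# get_info() consumes one deb822 Sources paragraph; a sketch of the relevant
# fields (hypothetical package and hashes):
#
#   Package: hello
#   Checksums-Sha1:
#    1111aaaa... 1976 hello_1.0-1deriv1.dsc
#    2222bbbb... 9182 hello_1.0.orig.tar.gz
#
# would yield ('sha1', '1111aaaa...', 'hello_1.0-1deriv1.dsc', [orig tarball part]).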

def get_debian_info(files):
        dsc = [file for file in files if file[0].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any Debian dsc files: snapshots bug or ancient source package')
                return None
        if len(dsc) > 1:
                logging.warning('found multiple Debian dsc files, choosing first one: %s' % ' '.join(['%s %s' % (dsc_name, dsc_sha1) for dsc_name, dsc_sha1 in dsc]))

        dsc = dsc[0]
        dsc_name, dsc_sha1 = dsc

        parts = []
        part_names = []
        for file in files:
                part_name, part_sha1 = file
                if part_name not in part_names and not part_name.endswith('.dsc'):
                        parts.append(file)
                        part_names.append(part_name)

        return (dsc_sha1, dsc_name, parts)

# Functions for extracting information from the snapshots database

def database_error(e):
        reason = None
        code = None
        if hasattr(e, 'pgerror'): reason = e.pgerror
        if hasattr(e, 'pgcode'): code = e.pgcode
        logging.warning('unable to execute database query: %s %s', code, reason)
        conn.reset()

def srcpkg_was_in_debian(name, version=None):
        try:
                if version:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s AND version=%s LIMIT 1;', (name, version))
                else:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s LIMIT 1;', (name,))
                return not not cur.fetchone()
        except psycopg2.Error, e:
                database_error(e)
                return None

def sha1_to_srcpkgs(sha1):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_sha1s(name, version):
        try:
                cur.execute(
                        '''SELECT hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        WHERE name=%s AND version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_srcpkgs(name):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s ORDER BY version DESC;''', (name,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def sha1s_to_files(sha1):
        try:
                cur.execute('SELECT DISTINCT ON (name, hash) name, hash FROM file WHERE hash=%s;', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_files(name, version):
        try:
                cur.execute(
                        '''SELECT DISTINCT ON (file.name, file.hash) file.name, file.hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        JOIN file ON file_srcpkg_mapping.hash=file.hash
                        WHERE srcpkg.name=%s AND srcpkg.version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def sha1_version_to_derived_from(sha1, version):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s AND version<=%s
                        ORDER BY name ASC, version DESC
                        LIMIT 1;''', (sha1, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s
                        ORDER BY name ASC, version ASC
                        LIMIT 1;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_derived_from(name, version):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s AND version<=%s
                        ORDER BY version DESC
                        LIMIT 1;''', (name, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s
                        ORDER BY version ASC
                        LIMIT 1;''', (name,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

# Functions related to creating patches

# Add symlinks for all needed files
def prepare(dsc_name, dsc_sha1, parts):
        logging.debug('preparing deriv directory for %s', dsc_name)
        size = 0
        unreadable_parts = []
        tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-%s-' % derivative_short_name)
        path = snapshot_hash_path(dsc_sha1)
        if not os.path.exists(path): path = hash_path(sha1_cache_dir, dsc_sha1)
        if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
        # The dsc data does not end up in the tmp dir, don't add the size here
        dsc_path = os.path.join(tmp_dir, dsc_name)
        os.symlink(path, dsc_path)
        converted_parts = []
        for part in parts:
                path = part_hash_path(part)
                if not path: continue
                if not os.access(path, os.R_OK):
                        unreadable_parts.append(part['name'])
                        continue
                else:
                        size += uncompressed_size(path)
                part_path = os.path.join(tmp_dir, part['name'])
                os.symlink(path, part_path)
                # Some distributions allow additional compression schemes
                # Here we work around this by recompressing with gzip
                if part['name'].endswith('.lz'):
                        converted_parts.append(convert_lzip_to_gzip(tmp_dir, part['name']))
        # Update the dsc file if we recompressed any files
        if converted_parts:
                update_dsc_file(tmp_dir, dsc_name, converted_parts)
        if unreadable_parts:
                logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
                rmtree(tmp_dir)
                return (None, size)
        space = tmp_space()
        if size > space:
                logging.warning('not enough space for temporary files: %s > %s', size, space)
                rmtree(tmp_dir)
                return (None, size)
        return (tmp_dir, size)

def prepare_debian(dsc_name, dsc_sha1, files, size):
        logging.debug('preparing Debian directory for %s', dsc_name)
        unreadable_parts = []
        debian_tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-Debian-')
        path = snapshot_hash_path(dsc_sha1)
        if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
        # The dsc data does not end up in the tmp dir, don't add the size here
        dsc_path = os.path.join(debian_tmp_dir, dsc_name)
        os.symlink(path, dsc_path)
        for file in files:
                part_name, part_sha1 = file
                path = snapshot_hash_path(part_sha1)
                if not os.access(path, os.R_OK):
                        unreadable_parts.append(part_name)
                        continue
                else:
                        size += uncompressed_size(path)
                part_path = os.path.join(debian_tmp_dir, part_name)
                os.symlink(path, part_path)
        if unreadable_parts:
                logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
                rmtree(debian_tmp_dir)
                return None
        space = tmp_space()
        if size > space:
                logging.warning('not enough space for all temporary files: %s > %s', size, space)
                rmtree(debian_tmp_dir)
                return None
        return debian_tmp_dir

def get_changelog_entries(tmp_dir, dsc_name, dsc_sha1):
        logging.debug('getting changelog entries from %s', dsc_name)

        # Cache check
        changelog_path = hash_path(sha1_changelog_dir, dsc_sha1)
        if os.path.exists(changelog_path):
                logging.debug('changelog cache exists for %s %s', dsc_name, dsc_sha1)
                f = file(changelog_path)
                try: changelog_entries = json.load(f)
                except ValueError: pass
                else: return [tuple(entry) for entry in changelog_entries]
                finally: f.close()

        # Preparation
        extract_path = os.path.join(tmp_dir,'extracted')

        # Unpack the source tree
        logging.debug('unpacking source package %s', dsc_name)
        cmdline = ['dpkg-source', '-x', dsc_name, 'extracted']
        process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('dpkg-source reported failure to extract %s:', dsc_name)
                logging.warning(output)
                cmdline = ['ls', '-lR', '--time-style=+']
                process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                output = process.communicate()[0]
                logging.warning(output)
                rmtree(extract_path)
                return None

        # Sanitise the debian dir and changelog file in case either is a symlink to outside
        debian_dir = os.path.join(extract_path, 'debian')
        changelog_filename = os.path.join(debian_dir,'changelog')
        if os.path.islink(debian_dir) or os.path.islink(changelog_filename):
                logging.warning('debian dir or changelog is a symbolic link %s', dsc_name)
                rmtree(extract_path)
                return None

        # Check if the changelog exists
        if not os.path.exists(changelog_filename):
                logging.warning('could not find changelog in %s', dsc_name)
                rmtree(extract_path)
                return None

        # Find out which source package is the most likely derivative
        logging.debug('parsing changelog for %s', dsc_name)
        changelog_file = open(changelog_filename)
        try:
                changelog_obj = changelog.Changelog(changelog_file)
        except UnicodeDecodeError:
                changelog_file.seek(0)
                changelog_obj = changelog.Changelog(changelog_file, encoding='iso-8859-1')
        try:
                changelog_entries = [(entry.package, str(entry._raw_version)) for entry in changelog_obj]
        except:
                logging.warning('could not read changelog from %s', dsc_name)
                rmtree(extract_path)
                return None
        del changelog_obj
        changelog_file.close()

        # Clean up again
        rmtree(extract_path)

        # Write the cache
        makedirs(hash_path_parent(sha1_changelog_dir, dsc_sha1))
        remove(changelog_path)
        f = file(changelog_path, 'w')
        json.dump(changelog_entries, f)
        f.close()

        return changelog_entries
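# The cached value is a JSON list of [package, version] pairs in changelog
# order (typically newest first), e.g. (a sketch with made-up versions):
#   [["hello", "1.0-1deriv1"], ["hello", "1.0-1"]]
# which is handed back to the caller as a list of tuples.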

# Find the source package name and version this is probably derived from
def find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified):
        logging.debug('finding base source package of %s %s', name, version)

        # Get a list of changelog entries
        changelog_entries = get_changelog_entries(tmp_dir, dsc_name, dsc_sha1)
        if changelog_entries:
                logging.debug('changelog entries are: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in changelog_entries]))

        # Get a list of candidate versions from the database
        possibly_derived_from = []
        logging.debug('checking which parts were in Debian')
        for part_sha1, part_name in parts_unmodified:
                part_derived_from = sha1_to_srcpkgs(part_sha1)
                if part_derived_from:
                        logging.debug('part %s %s available in %s', part_sha1, part_name, ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in part_derived_from]))
                        possibly_derived_from.extend(part_derived_from)

        if not possibly_derived_from:
                logging.debug('no parts in common with Debian, obtaining old versions')
                old_packages = srcpkg_to_srcpkgs(name)
                if old_packages: possibly_derived_from = old_packages

        # Uniqify
        possibly_derived_from = list(set(possibly_derived_from))
        if possibly_derived_from:
                logging.debug('possibly derived from: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in possibly_derived_from]))
        else:
                logging.debug('nothing in possibly derived from list')

        # Match changelog versions against candidates
        if changelog_entries:
                logging.debug('matching changelog entries against versions possibly derived from')
                for entry in changelog_entries:
                        entry_name, entry_version = entry
                        if entry in possibly_derived_from:
                                logging.debug('%s %s in possibly derived from', entry_name, entry_version)
                                return entry
                logging.debug('checking if changelog entries were ever in Debian')
                for entry_name, entry_version in changelog_entries:
                        if srcpkg_was_in_debian(entry_name, entry_version):
                                logging.debug('%s %s was in Debian', entry_name, entry_version)
                                return (entry_name, entry_version)
        if possibly_derived_from:
                logging.debug('finding closest entry in possibly derived from')
                possibly_derived_from.sort(cmp=lambda a,b: apt_version_cmp(b[1],a[1]))
                for entry_name, entry_version in possibly_derived_from:
                        if name == entry_name and apt_version_cmp(version, entry_version) >= 0:
                                logging.debug('%s %s is an equal or lower version', entry_name, entry_version)
                                return (entry_name, entry_version)
                entry = possibly_derived_from[-1]
                entry_name, entry_version = entry
                logging.debug('no lower version numbers, returning next highest version %s %s', entry_name, entry_version)
                return entry
        logging.debug('finding closest version number in Debian')
        for entry in srcpkg_to_derived_from(name, version) or []:
                entry_name, entry_version = entry
                logging.debug('closest package was %s %s', entry_name, entry_version)
                return entry
        logging.debug('could not find Debian package %s %s is derived from', name, version)
        return None

# Generate a patch file
def create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1):
        global repackaged_but_identical

        dsc_path = os.path.join(tmp_dir, dsc_name)
        debian_dsc_path = os.path.join(debian_tmp_dir, debian_dsc_name)
        path_everything = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
        path_debian = sha1_patch_path(debian_dsc_sha1, dsc_sha1, 'debian')

        # Generate the main patch
        if not os.path.exists(path_everything) and os.path.exists(debian_dsc_path) and os.path.exists(dsc_path):
                makedirs(os.path.dirname(path_everything))
                cmdline = ['debdiff', '--quiet', '--diffstat', debian_dsc_path, dsc_path]
                stdout = open(path_everything, 'w')
                process = subprocess.Popen(cmdline, stdout=stdout, stderr=subprocess.PIPE, preexec_fn=subprocess_setup)
                output = process.communicate()[1]
                stdout.close()
                if process.returncode == 255:
                        logging.warning('debdiff reported failure %s %s:', debian_dsc_name, dsc_name)
                        logging.warning(output)
                        cmdline = ['ls', '-lR', '--time-style=+']
                        for name, dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
                                logging.warning('dir listing for %s:', name)
                                process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                                output = process.communicate()[0]
                                logging.warning(output)
                        return False
                elif process.returncode == 0:
                        logging.info('derivative repackaged in an identical way %s %s', debian_dsc_name, dsc_name)
                        repackaged_but_identical += 1
                        return False
                elif process.returncode != 1:
                        logging.warning('debdiff reported unknown return code %s %s %s:', process.returncode, debian_dsc_name, dsc_name)
                        logging.warning(output)
                        cmdline = ['ls', '-lR', '--time-style=+']
                        for name, dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
                                logging.warning('dir listing for %s:', name)
                                process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                                output = process.communicate()[0]
                                logging.warning(output)
                        return False

        # Filter the main patch to include only the debian/ directory
        if os.path.exists(path_everything) and not os.path.exists(path_debian):
                makedirs(os.path.dirname(path_debian))
                cmdline = ['filterdiff', '--include=*/debian/*', path_everything]
                filterdiff = subprocess.Popen(cmdline, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
                filterdiff_output = filterdiff.communicate()[0]
                diffstat = subprocess.Popen('diffstat', stdin=subprocess.PIPE, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
                diffstat_output = diffstat.communicate(filterdiff_output)[0]
                f = open(path_debian, 'w')
                f.write('diffstat of debian/ for %s %s\n' % (os.path.splitext(debian_dsc_name)[0], os.path.splitext(dsc_name)[0]))
                f.write('\n')
                f.write(diffstat_output)
                f.write('\n')
                f.write(filterdiff_output)
                f.close()

        # Patches > 100MB are probably not that useful, replace them with a link
        for path in path_everything, path_debian:
                try:
                        if os.path.getsize(path) > 104857600:
                                logging.info('patch between %s and %s is larger than 100MB', dsc_name, debian_dsc_name)
                                remove(path)
                                symlink(os.path.relpath(patch_too_large, os.path.dirname(path)), path)
                except OSError:
                        pass

        return True

def check_patch(debian_dsc_sha1, dsc_sha1):
        patch_path = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
        lsdiff_path = sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1)
        if os.path.exists(lsdiff_path):
                logging.debug('lsdiff cache exists for %s', patch_path)
                f = file(lsdiff_path)
                lsdiff = f.read()
                f.close()
        else:
                logging.debug('lsdiff cache does not exist for %s', patch_path)
                cmdline = ['lsdiff', patch_path]
                process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                lsdiff = process.communicate()[0]
                makedirs(os.path.dirname(lsdiff_path))
                f = file(lsdiff_path,'w')
                f.write(lsdiff)
                f.close()
        for line in lsdiff.splitlines():
                if line != 'debian/changelog' and not line.endswith('/debian/changelog'):
                        logging.debug('patch changes files other than debian/changelog')
                        return True
        logging.debug('patch does not change files other than debian/changelog')
        return False
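# lsdiff prints one affected path per line, e.g. (a sketch):
#   hello-1.0/debian/changelog
#   hello-1.0/debian/control
# so a patch whose only change is debian/changelog is deemed not useful.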

def present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1):
        useful_patch = check_patch(debian_dsc_sha1, dsc_sha1)
        patches = []
        types = ('', 'debian')
        for type in types:
                ln_to = sha1_patch_path(debian_dsc_sha1, dsc_sha1, type)
                if not os.path.exists(ln_to):
                        continue
                ln_from_deriv = deriv_patch_path(name, version, debian_name, debian_version, type)
                ln_from_global = global_patch_path(name, version, debian_name, debian_version, type)
                makedirs(os.path.dirname(ln_from_deriv))
                makedirs(os.path.dirname(ln_from_global))
                remove(ln_from_deriv)
                remove(ln_from_global)
                symlink(os.path.relpath(ln_to, os.path.dirname(ln_from_deriv)), ln_from_deriv)
                symlink(os.path.relpath(ln_to, os.path.dirname(ln_from_global)), ln_from_global)
                if useful_patch:
                        patches.append(os.path.relpath(ln_from_global, os.path.abspath(global_patch_dir)))
        return tuple(patches)

# Functions that wrap other functions and decide what to do

def check_source_package(source_entry, srcpkg):
        global modifies_dsc_files

        try:
                name = None
                version = None
                dir = None
                name = srcpkg['Package']
                version = srcpkg['Version']
                dir = srcpkg['Directory']
                if '/' in name or name == '..':
                        logging.warning('could not process source package %s %s: possibly malicious name', name, version)
                        return None
                if '/' in version or version == '..':
                        logging.warning('could not process source package %s %s: possibly malicious version', name, version)
                        return None
                if '..' in dir.split('/'):
                        logging.warning('could not process source package %s %s: possibly malicious dir: %s', name, version, dir)
                        return None
        except KeyError:
                logging.warning('could not process source package %s %s', name, version)
                return None
        logging.debug('started processing source package %s %s', name, version)
        info = get_info(srcpkg)
        if not info:
                logging.warning('finished processing source package %s %s: could not get any info', name, version)
                return None
        dsc_hash_type, dsc_hash, dsc_name, parts = info
        if '/' in dsc_name or dsc_name == '..':
                logging.warning('could not process source package %s %s: possibly malicious dsc name %s', name, version, dsc_name)
                return None
        if not ishex(dsc_hash):
                logging.warning('could not process source package %s %s: possibly malicious dsc hash %s', name, version, dsc_hash)
                return None
        dsc_url = source_entry.archive_uri('%s/%s' % (dir, dsc_name))
        logging.debug('found dsc file: %s %s %s', dsc_hash_type, dsc_hash, dsc_url)
        dsc_status, dsc_sha1 = status(dsc_hash_type, dsc_hash, dsc_url)
        logging.debug('checked dsc status: %s %s %s', dsc_status, dsc_sha1, dsc_url)
        if dsc_status == 'unmodified':
                # Ignore the srcpkg since we know it was in Debian
                # at one point and is hopefully therefore unmodified
                logging.debug('finished processing source package %s %s: dsc unmodified', name, version)
                return None
        else:
                files = [(dsc_sha1, dsc_hash_type, dsc_hash)]
                parts_unmodified = []
                parts_modified = []
                parts_unknown = []
                for part in parts:
                        part_name = part['name']
                        if '/' in part_name or part_name == '..':
                                logging.warning('could not process source package %s %s: possibly malicious part name %s', name, version, part_name)
                                return None
                        part_url = source_entry.archive_uri('%s/%s' % (dir, part_name))
                        part_hash_type, part_hash = [(k, v) for k, v in part.iteritems() if k not in ('name', 'size')][0]
                        if not ishex(part_hash):
                                logging.warning('could not process source package %s %s: possibly malicious part hash %s', name, version, part_hash)
                                return None
                        logging.debug('found part file: %s %s %s', part_hash_type, part_hash, part_url)
                        part_status, part_sha1 = status(part_hash_type, part_hash, part_url)
                        logging.debug('checked part status: %s %s %s', part_status, part_sha1, part_url)
                        if 'sha1' not in part and part_sha1: part['sha1'] = part_sha1
                        if part_status == 'unmodified': parts_unmodified.append((part_sha1, part_name))
                        elif part_status == 'modified': parts_modified.append((part_sha1, part_name))
                        else: parts_unknown.append((part_sha1, part_name))
                        if part_status == 'modified': files.append((part_sha1, part_hash_type, part_hash))

1011                 all_parts_unmodified = (len(parts_unmodified) == len(parts))
1012                 parts_unmodified = list(set(parts_unmodified))
1013                 logging.debug('source package status %s %s: dsc %s, %s parts unmodified, %s parts modified, %s parts unknown', name, version, dsc_status, len(parts_unmodified), len(parts_modified), len(parts_unknown))
1014
1015                 if all_parts_unmodified:
1016                         # Ignore the srcpkg since we know all the parts were
1017                         # in Debian at one point and ergo, it is unmodified
1018                         logging.debug('finished processing source package %s %s: all non-dsc parts unmodified', name, version)
1019                         if dsc_status == 'modified':
1020                                 logging.info('source package %s %s: unmodified, but dsc different', name, version)
                                modifies_dsc_files += 1
                        return (files, None, None, None)
                else:
                        logging.debug('some parts modified, looking for derived version %s %s', name, version)
                        if not dsc_sha1:
                                logging.warning('finished processing source package %s %s: sha1 missing for dsc file', name, version)
                                return (files, None, None, None)
                        if parts_unknown:
                                logging.warning('finished processing source package %s %s: sha1 missing for some parts', name, version)
                                return (files, None, None, None)
                        new = None
                        link = None
                        patch = None
                        tmp_dir, size = prepare(dsc_name, dsc_sha1, parts)
                        if not tmp_dir:
                                logging.warning('source package %s %s: could not create temporary dir for deriv: %s', name, version, dsc_name)
                                return (files, None, None, None)
                        derived_from = find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified)
                        if derived_from:
                                debian_name, debian_version = derived_from
                                link = (debian_name, debian_version, name, version, dsc_url)
                                logging.debug('source package %s %s derived from %s %s', name, version, debian_name, debian_version)
                                debian_files = srcpkg_to_files(debian_name, debian_version)
                                if debian_files:
                                        debian_info = get_debian_info(debian_files)
                                        if debian_info:
                                                debian_dsc_sha1, debian_dsc_name, debian_parts = debian_info
                                                logging.debug('Debian source package %s %s dsc found %s %s', debian_name, debian_version, debian_dsc_name, debian_dsc_sha1)
                                                debian_tmp_dir = prepare_debian(debian_dsc_name, debian_dsc_sha1, debian_parts, size)
                                                if debian_tmp_dir:
                                                        patch_created = create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1)
                                                        if patch_created:
                                                                patch_names = present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1)
                                                                if patch_names:
                                                                        patch = (debian_name, debian_version, debian_dsc_sha1, name, version, dsc_sha1, [part[0] for part in parts_modified], patch_names)
                                                                else:
                                                                        logging.debug('patch between %s %s and %s %s is probably not useful', debian_name, debian_version, name, version)
                                                        rmtree(debian_tmp_dir)
                                                else:
                                                        # Possible causes: lack of disk space, a snapshot.debian.org problem or a file that is not distributable
                                                        logging.warning('source package %s %s: could not create temporary dir for Debian: %s %s', name, version, debian_name, debian_version)
                                        else:
                                                logging.warning('source package %s %s: could not get Debian info for %s %s: %s', name, version, debian_name, debian_version, debian_info)
                                else:
                                        if srcpkg_was_in_debian(debian_name, debian_version):
                                                logging.warning('source package %s %s: snapshot database issue, no Debian files found', debian_name, debian_version)
                                        else:
                                                logging.warning('source package %s %s: derived from %s %s possibly bogus', name, version, debian_name, debian_version)
                        else:
                                new = (name, version, dsc_url)
                        rmtree(tmp_dir)
                        logging.debug('finished processing source package %s %s: all done', name, version)
                        return (files, patch, link, new)

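# process_sources walks every sources.list entry of the derivative, parses the
# corresponding Sources file and aggregates the per-package results of
# check_source_package into four lists. An illustrative sketch of the return
# value (the angle-bracketed names are placeholders, not real data):
#
#   files:   [[(<part sha1>, <hash type>, <hash>), ...], ...]
#   patches: [(<debian name>, <debian version>, <debian dsc sha1>,
#              <deriv name>, <deriv version>, <deriv dsc sha1>,
#              [<sha1 of each modified part>], [<patch file names>]), ...]
#   links:   [(<debian name>, <debian version>, <deriv name>, <deriv version>, <dsc url>), ...]
#   new:     [(<deriv name>, <deriv version>, <dsc url>), ...]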
def process_sources(source_entries, lists_dir):
        files = []
        patches = []
        links = []
        new = []
        for source in source_entries:
                for source_entry in source:
                        logging.debug('processing sources.list entry %s', source_entry.describe)
                        fn = os.path.join(lists_dir, source_entry.describe.rstrip(')').rpartition('(')[2])
                        try: f = file(fn)
                        except IOError: continue
                        for srcpkg in deb822.Sources.iter_paragraphs(f):
                                actions = check_source_package(source_entry, srcpkg)
                                if actions:
                                        action_files, action_patch, action_link, action_new = actions
                                        if action_files:
                                                files.append(action_files)
                                                logging.debug('action: return files %s', ' '.join([' '.join([str(item) for item in action]) for action in action_files]))
                                        if action_patch:
                                                patches.append(action_patch)
                                                logging.debug('action: return patches %s', ' '.join([' '.join(item) if isinstance(item, (list, tuple)) else str(item) for item in action_patch]))
                                        if action_link:
                                                links.append(action_link)
                                                logging.debug('action: return links to modified source packages %s', ' '.join(action_link))
                                        if action_new:
                                                new.append(action_new)
                                                logging.debug('action: return links to new source packages %s', ' '.join(action_new))
                                logging.debug('done')
                                logging.debug('')
                        f.close()
        return (files, patches, links, new)

logging.debug('processing distribution %s', derivative_short_name)

files, patches, links, new = process_sources(source_entries, lists_dir)

# Done with the database, close the connection
cur.close()
conn.close()

# Write out the results
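# The remaining command-line arguments name the output files: sys.argv[3]
# receives the file hash map, sys.argv[4] the patch descriptions, sys.argv[5]
# the links between Debian and derivative versions and sys.argv[6] the new
# packages. Each is written as YAML; the last three are removed instead when
# there is nothing to write.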
filename = sys.argv[3]
data = files
if data:
        output_data = {}
        for package in data:
                for modified_file in package:
                        sha1, hash_type, hash = modified_file
                        if sha1 not in output_data:
                                output_data[sha1] = {}
                        if hash_type != 'sha1' and hash_type not in output_data[sha1]:
                                output_data[sha1][hash_type] = hash
                        elif hash_type != 'sha1' and hash != output_data[sha1][hash_type]:
                                logging.warning('hashes mismatched: %s: %s %s != %s', sha1, hash_type, hash, output_data[sha1][hash_type])
        output = file(os.path.abspath(filename), 'wb')
        yaml.safe_dump(output_data, output)
        output.close()
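# Illustrative sketch of the resulting YAML (the digests are placeholders):
# each part's SHA-1 maps to its other known hashes, roughly:
#
#   <part sha1>:
#     md5: <md5 hex digest>
#     sha256: <sha256 hex digest>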

filename = sys.argv[4]
data = patches
if data:
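        # Make the patch directories presentable over HTTP: link in the index
        # header and htaccess files once, for both the global and the
        # per-derivative patch directories.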
        if not os.path.exists(os.path.join(global_patch_dir,'HEADER.html')):
                symlink('../../doc/HEADER.patches.html',os.path.join(global_patch_dir,'HEADER.html'))
        if not os.path.exists(os.path.join(global_patch_dir,'.htaccess')):
                symlink('../../etc/htaccess.patches',os.path.join(global_patch_dir,'.htaccess'))
        if not os.path.exists(os.path.join(deriv_patch_dir,'HEADER.html')):
                symlink('../../../doc/HEADER.patches.html',os.path.join(deriv_patch_dir,'HEADER.html'))
        if not os.path.exists(os.path.join(deriv_patch_dir,'.htaccess')):
                symlink('../../../etc/htaccess.patches',os.path.join(deriv_patch_dir,'.htaccess'))
        output_data = []
        for item in data:
                debian_name, debian_version, debian_sha1, name, version, sha1, parts_sha1, patch_names = item
                item = {}
                item['debian_name'] = debian_name
                item['debian_version'] = debian_version
                item['debian_sha1'] = debian_sha1
                item['name'] = name
                item['version'] = version
                item['sha1'] = sha1
                item['patches'] = patch_names
                item['parts'] = parts_sha1
                output_data.append(item)
        output = file(os.path.abspath(filename), 'wb')
        yaml.safe_dump(output_data, output)
        output.close()
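# Illustrative sketch of one patches list entry (values are placeholders):
#
#   - debian_name: <Debian source package>
#     debian_version: <Debian version>
#     debian_sha1: <sha1 of the Debian dsc>
#     name: <derivative source package>
#     version: <derivative version>
#     sha1: <sha1 of the derivative dsc>
#     patches: [<patch file names>]
#     parts: [<sha1 of each modified part>]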
else:
        remove(filename)

filename = sys.argv[5]
data = links
if data:
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]) or apt_version_cmp(a[3],b[3]))
        output_data = {}
        output = file(os.path.abspath(filename), 'wb')
        for debian_name, debian_version, name, version, dsc_url in data:
                if debian_name not in output_data:
                        output_data[debian_name] = {}
                if debian_version not in output_data[debian_name]:
                        output_data[debian_name][debian_version] = []
                item = {}
                item['name'] = name
                item['version'] = version
                item['dsc'] = dsc_url
                output_data[debian_name][debian_version].append(item)
        yaml.safe_dump(output_data, output)
        output.close()
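# Illustrative sketch of the links YAML (names are placeholders):
#
#   <debian name>:
#     <debian version>:
#       - name: <derivative name>
#         version: <derivative version>
#         dsc: <dsc URL>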
else:
        remove(filename)

filename = sys.argv[6]
data = new
if data:
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]))
        output_data = {}
        output = file(os.path.abspath(filename), 'wb')
        for name, version, dsc_url in data:
                if name not in output_data:
                        output_data[name] = {}
                if version not in output_data[name]:
                        output_data[name][version] = []
                output_data[name][version].append(str(dsc_url))
        yaml.safe_dump(output_data, output)
        output.close()
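# Illustrative sketch of the new package YAML (names are placeholders):
#
#   <source package name>:
#     <version>:
#       - <dsc URL>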
else:
        remove(filename)

logging.shutdown()