6be1e15af3add520fc5bf64c6d67e17c91e0733e
[dex/census.git] / bin / compare-source-package-list
1 #!/usr/bin/python
2
3 # Copyright 2011 Paul Wise
4 # Released under the MIT/Expat license, see doc/COPYING
5
6 # Uses the snapshot.debian.org metadata database and SHA-1 based filesystem to
7 # compute debdiffs between Debian and individual derivatives. The metadata
8 # allows knowing if a particular file was ever in Debian and the filesystem
9 # allows the creation of debdiffs.
10 #
11 # The script works approximately like this:
12 #
13 # Load the Sources files previously downloaded by get-package-lists as indicated
14 # by the sources.list of the derivative.
15 #
16 # For each source package in the derivative:
17 #
18 # Check if the dsc has ever been in Debian, if not, check if the other
19 # parts have and therefore decide if the package is unmodified or not.
20 # Unmodified source packages are skipped and include those with the exact
21 # same dsc file or those where all the non-dsc parts are identical.
22 #
23 # Try some heuristics (name, version, changelog entries) to find out if
24 # the package could be based on some package that is or was in Debian.
25 #
26 # If it was not then skip to the next one and make a note, since Debian
27 # might want to know about source packages that are missing from Debian.
28 #
29 # If it was then use debdiff to create a diff and filterdiff to create a
30 # diff of the debian/ dir.
31 #
32 # Usage:
33 # compare-source-package-list <sources.list> <apt dir> <patches list> <links list> <new package list> <log file>
34
35 # FIXME: write out some statistics and rrdtool graphs
36 #               source package types per derivative
37 #               number of source packages
38 #               cache misses: md5, sha256, sha1, patch, changelog
39 # FIXME: comment the code to list assumptions and function purpose
40 # FIXME: add options to allow re-processing only specific packages
41 # FIXME: write something to clean up old files and patches
42 # FIXME: don't unpack or make a patch when we don't have all the parts
43 # FIXME: don't make a patch when we were not able to unpack the source package
44 # FIXME: cleanup files at start of run
45 # FIXME: extract new debian/patches/ patches
46 # FIXME: print out packages that are no longer in Debian
47 # FIXME: deal with really large patches:
48 # FIXME:   kde-l10n-*: too few parts to be useful
49 # FIXME:   divergence: too many changelog entries between versions to be useful
50 # FIXME:   derivative is older than Debian
51 # FIXME:   derivative renamed the source package
52 # FIXME:   just a really big diff
53 # FIXME: when there are multiple dsc files in snapshots, prefer the debian/debian-archive one
54 # FIXME: when the source package is ancient and the dsc is missing, make a fake one to use
55 # FIXME: add an in-memory cache of hashes so that hashes in multiple releases hit the disk once
56 # FIXME: deal with rate-limited websites like alioth that do not like many requests
57
58 import re
59 import os
60 import sys
61 import requests
62 import hashlib
63 import shutil
64 import logging
65 import tempfile
66 import string
67 import socket
68 import signal
69 import subprocess
70 import yaml
71 from debian import deb822, changelog
72 import apt_pkg
73 import psycopg2
74 try: import simplejson as json
75 except ImportError: import json
76 import struct
77
78 # Helper functions for python stuff with annoying error handling
79
def makedirs(dirs):
        # Best-effort recursive directory creation: an already-existing
        # path (or any other OSError) is deliberately ignored.
        try:
                os.makedirs(dirs)
        except OSError:
                pass
83
def rmtree(dir):
        # Best-effort recursive delete; a missing tree (or any other
        # OSError) is silently tolerated.
        try:
                shutil.rmtree(dir)
        except OSError:
                pass
87
def remove(file):
        # Best-effort single-file delete; a missing file (or any other
        # OSError) is ignored.
        try:
                os.remove(file)
        except OSError:
                pass
91
def symlink(source, link):
        # Best-effort symlink creation; failures such as an existing link
        # name raise OSError, which is ignored.
        try:
                os.symlink(source, link)
        except OSError:
                pass
95
# http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
        """Restore the default SIGPIPE disposition in a child process.

        Python installs its own SIGPIPE handler at startup; non-Python
        subprocesses generally expect the POSIX default instead.
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
101
# Normalize apt_pkg.version_compare to the classic cmp() contract.
# Only the sign of the library's return value is meaningful; the
# documentation overstates this (see https://bugs.debian.org/680891).
def apt_version_cmp(a, b):
        result = apt_pkg.version_compare(a, b)
        if result > 0:
                return 1
        if result < 0:
                return -1
        return 0
109
# Config
# Hash-addressed caches of files downloaded from derivative archives,
# one farm per checksum type.
md5_cache_dir = os.path.abspath('../md5-farm')
sha1_cache_dir = os.path.abspath('../sha1-farm')
sha256_cache_dir = os.path.abspath('../sha256-farm')
# Output caches keyed by (Debian dsc sha1, derivative dsc sha1).
sha1_patch_dir = os.path.abspath('../sha1-patches')
sha1_lsdiff_dir = os.path.abspath('../sha1-lsdiff')
sha1_changelog_dir = os.path.abspath('../sha1-changelog')
# Patch trees: per-derivative (cwd-relative) and shared across derivatives.
deriv_patch_dir = os.path.abspath('patches')
global_patch_dir = os.path.abspath('../patches')
# Read-only SHA-1 farm maintained by snapshot.debian.org.
snapshot_cache_dir = '/srv/snapshot.debian.org/farm'
patch_too_large = os.path.abspath('../../doc/patch-too-large.txt')
# Parallel tuples: dsc checksum field names, the matching hashlib
# constructor names, and the dsc headers those checksums live under.
checksum_types = ('sha1', 'sha256', 'md5sum')
checksum_hashlib = ('sha1', 'sha256', 'md5')
checksum_headers = ('Checksums-Sha1', 'Checksums-Sha256', 'Files')
user_agent = 'Debian Derivatives Census QA bot'
timeout = 60  # seconds, per HTTP request
# True when the string consists solely of hex digits.
ishex = lambda s: not(set(s)-set(string.hexdigits))
# Use the Debian-wide CA bundle when available, else default verification.
debian_ssl_bundle = '/etc/ssl/ca-global/ca-certificates.crt'
if os.path.exists(debian_ssl_bundle):
        ssl_verify = debian_ssl_bundle
else:
        ssl_verify = True
132
# Init
apt_pkg.init()

# Preparation
sources_list = apt_pkg.SourceList()
sources_list.read_main_list()
# Connection to the snapshot.debian.org metadata database via a
# PostgreSQL connection-service definition.
conn = psycopg2.connect("service=snapshot-guest")
cur = conn.cursor()
# argv[7] is the log file (see the usage comment above); truncate it by
# removing any leftover from a previous run before logging starts.
remove(sys.argv[7])
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG, filename=sys.argv[7])

# Voodoo
lists_dir = apt_pkg.config.find_dir('Dir::State::lists')
# For every sources.list entry keep only the source package indices.
source_entries = [[i for i in x.index_files if i.label=='Debian Source Index'] for x in sources_list.list]
# The script is run from the per-derivative directory.
derivative_short_name = os.path.basename(os.getcwd())
# Statistics counters.
modifies_dsc_files = 0
repackaged_but_identical = 0
150
151 # Generic helper functions
152
def uncompressed_size(filename):
        """Estimate the uncompressed size of a possibly-compressed file.

        Sniffs the leading magic bytes to recognise gzip, bzip2, xz and
        lzip, and returns the larger of the on-disk size and the
        estimated uncompressed size (a usable lower bound for scratch
        space planning).
        """
        uc_size = 0
        file_size = os.path.getsize(filename)
        with open(filename, 'rb') as f:
                magic = f.read(6)
                # *.gz
                # NOTE(review): these magic comparisons rely on Python 2
                # semantics where read() on a 'rb' file returns str;
                # under Python 3 bytes never compare equal to str.
                if magic[:2] == "\x1f\x8b":
                        # gzip stores ISIZE (uncompressed size mod 2^32)
                        # in the final four bytes — inaccurate for
                        # members of 4GiB or more.
                        f.seek(-4, 2)
                        data = f.read()
                        uc_size = struct.unpack('<I', data)[0]
                # *.bz2
                elif magic[:3] == 'BZh':
                        # Crude estimate based on average compression ratio of 25%
                        uc_size = file_size*4
                # *.xz
                elif magic == "\xfd7zXZ\x00":
                        # xz has no fixed-position size trailer; scrape it
                        # out of `xz --verbose --list` instead.
                        cmdline = ['xz', '--verbose', '--list', filename]
                        process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                        output = process.communicate()[0]
                        if process.returncode:
                                logging.warning('xz reported failure to check size of %s:', filename)
                                logging.warning(output)
                        else:
                                for line in output.splitlines():
                                        line = line.strip()
                                        if line.startswith('Uncompressed size:'):
                                                # Size is printed with thousands separators, e.g. "1,234,567 B".
                                                match = re.match(r'Uncompressed size:  .*?([0-9,]+) B', line)
                                                if match: uc_size = int(''.join(match.group(1).split(',')))
                                                else: logging.warning('xz reported weird output for %s: %s', filename, line)
                # *.lz
                elif magic[:4] == 'LZIP':
                        # lzip trailer is CRC32 (4 bytes), data size (8),
                        # member size (8); seek to the data-size field.
                        f.seek(-16, 2)
                        data = f.read(8)
                        uc_size = struct.unpack('<Q', data)[0]
        return max(file_size, uc_size)
188
def tmp_size():
        """Total size in bytes of the filesystem holding the temp dir."""
        st = os.statvfs(tempfile.gettempdir())
        return st.f_blocks * st.f_frsize
192
def tmp_space():
        """Free space in bytes (for unprivileged users) on the temp dir's
        filesystem."""
        st = os.statvfs(tempfile.gettempdir())
        return st.f_bavail * st.f_frsize
196
def tmp_environ(tmp_dir):
        """Copy of the current environment with every temp-dir variable
        pointing at tmp_dir."""
        env = dict(os.environ)
        for var in ('TMP', 'TMPDIR', 'TEMP', 'TEMPDIR'):
                env[var] = tmp_dir
        return env
200
201 # Helper functions for generating path names
202
def hash_path_parent(dir, hash):
        """Two-level fan-out directory for a hash: dir/ab/cd."""
        first, second = hash[0:2], hash[2:4]
        return os.path.join(dir, first, second)
205
def hash_path(dir, hash):
        """Full fan-out path of a hash-named file: dir/ab/cd/abcdef..."""
        first, second = hash[0:2], hash[2:4]
        return os.path.join(dir, first, second, hash)
208
def hash_path_exists(dir, hash):
        """Whether the hash-named file exists in its fan-out location."""
        candidate = os.path.join(dir, hash[0:2], hash[2:4], hash)
        return os.path.exists(candidate)
211
def snapshot_hash_path(hash):
        """Path of a file in the snapshot.debian.org SHA-1 farm."""
        return hash_path(snapshot_cache_dir, hash)
214
def snapshot_hash_path_exists(hash):
        """Whether the snapshot.debian.org farm holds this SHA-1."""
        return hash_path_exists(snapshot_cache_dir, hash)
217
def part_hash_path(part):
        """Best local path for one source-package part.

        Prefers the snapshot farm for sha1-identified parts (falling back
        to the derivatives sha1 cache), then the sha256 and md5 caches.
        Returns None when the part carries no recognised checksum.
        """
        if 'sha1' in part:
                candidate = snapshot_hash_path(part['sha1'])
                if os.path.exists(candidate):
                        return candidate
                return hash_path(sha1_cache_dir, part['sha1'])
        if 'sha256' in part:
                return hash_path(sha256_cache_dir, part['sha256'])
        if 'md5sum' in part:
                return hash_path(md5_cache_dir, part['md5sum'])
        return None
229
def sha1_patch_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path for a patch between two dscs, keyed by both SHA-1s."""
        base = hash_path(sha1_patch_dir, debian_dsc_sha1)
        path = os.path.join(base, hash_path('', dsc_sha1))
        if type:
                path += '.%s' % type
        return os.path.abspath(path + '.patch')
235
def sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path for an lsdiff listing, keyed like sha1_patch_path."""
        base = hash_path(sha1_lsdiff_dir, debian_dsc_sha1)
        path = os.path.join(base, hash_path('', dsc_sha1))
        if type:
                path += '.%s' % type
        return os.path.abspath(path + '.lsdiff')
241
def shortslug(name):
        # Debian pool-style prefix: the first letter of the package name,
        # or the first four letters for lib* packages.
        if name.startswith('lib'):
                return name[:4]
        return name[0]
244
def deriv_patch_path(name, version, debian_name, debian_version, type=None):
        """Per-derivative output path for a patch against Debian."""
        directory = os.path.join(deriv_patch_dir, shortslug(debian_name), debian_name, '')
        filename = '_'.join((debian_name, debian_version, name, version))
        path = directory + filename
        if type:
                path += '.%s' % type
        return os.path.abspath(path + '.patch')
251
def global_patch_path(name, version, debian_name, debian_version, type=None):
        """Cross-derivative output path for a patch against Debian."""
        directory = os.path.join(global_patch_dir, shortslug(debian_name), debian_name, '')
        filename = '_'.join(('Debian', debian_name, debian_version, derivative_short_name, name, version))
        path = directory + filename
        if type:
                path += '.%s' % type
        return os.path.abspath(path + '.patch')
258
259 # Functions for munging source packages
260
def convert_lzip_to_gzip(dir, name):
        """Decompress dir/name with lzip and recompress it with gzip -1.

        Returns the pair (old_name, new_name) on success, or None (after
        logging a warning) when either tool fails.
        """
        cmdline = ['lzip', '-d', '--', name]
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('lzip reported failure to decompress %s:', name)
                logging.warning(output)
                return None
        bname = name[0:-3] # Strip off .lz
        cmdline = ['gzip', '-1', '--', bname] # gzip -1 to reduce overhead
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('gzip reported failure to compress %s:', bname)
                logging.warning(output)
                return None
        return (name, bname+'.gz')
278
def update_dsc_file(dir, dsc_name, parts):
        """Rewrite the checksum entries of a dsc after recompression.

        parts is a list of (old_name, new_name) pairs; for each pair every
        checksum header present in the dsc has its entry for old_name
        replaced with the new name, size and freshly-computed checksum.
        """
        dsc_path = os.path.join(dir,dsc_name)
        # Parse the dsc; `with` guarantees the handle is closed even when
        # parsing raises (the original leaked it on that path).
        with open(dsc_path,'rb') as dsc_file:
                dsc = deb822.Dsc(dsc_file)
        for (old, name) in parts:
                path = os.path.join(dir,name)
                size = os.path.getsize(path)
                with open(path,'rb') as f:
                        # One pass over the file feeds every checksum type.
                        hashes = {}
                        for (type, func) in zip(checksum_types, checksum_hashlib):
                                hashes[type] = getattr(hashlib, func)()
                        for chunk in iter(lambda: f.read(128*64), b''):
                                for type in checksum_types:
                                        hashes[type].update(chunk)
                        for type in checksum_types:
                                hashes[type] = hashes[type].hexdigest()
                        for (header, type) in zip(checksum_headers, checksum_types):
                                if header in dsc:
                                        dsc[header] = [{type: hashes[type], 'size': size, 'name': name} if p['name'] == old else p for p in dsc[header]]
        os.remove(dsc_path) # So we don't change the original that the dsc links to
        with open(dsc_path,'wb') as dsc_file:
                dsc.dump(dsc_file)
302
303 # Functions for downloading files and storing them in the hash caches
304
def download_and_check_hash(url, dir, hash, hash_type):
        """Download url, verify it against an expected md5sum/sha256 hash
        and file it in the cache under its SHA-1.

        Returns (True, sha1) when the file already exists in the snapshot
        farm (only a symlink is created), (False, sha1) when it was stored
        in the derivatives cache, or ('unknown', None) on any failure.
        """
        try:
                parent = hash_path_parent(dir,hash)
                path = hash_path(dir,hash)
                logging.debug('downloading %s', url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                response = requests.get(url, headers=headers, timeout=timeout, verify=ssl_verify)
                response.raise_for_status() # to catch HTTP errors
                # Without stream=True requests has already consumed the
                # body, so response.raw.read() would return nothing;
                # response.content holds the downloaded bytes.
                data = response.content
                if hash_type == 'sha256':
                        data_hash = hashlib.sha256(data).hexdigest()
                elif hash_type == 'md5sum':
                        data_hash = hashlib.md5(data).hexdigest()
                else:
                        logging.warning('unknown hash type detected: %s %s %s', hash_type, hash, url)
                        return ('unknown', None)
                if data_hash != hash:
                        logging.warning('incorrect hash for downloaded file, ignoring: %s %s != %s %s', hash_type, hash, data_hash, url)
                        return ('unknown', None)
                sha1 = hashlib.sha1(data).hexdigest()
                sha1_path = hash_path(sha1_cache_dir, sha1)
                sha1_parent = hash_path_parent(sha1_cache_dir, sha1)
                makedirs(sha1_parent)
                snapshot_path = snapshot_hash_path(sha1)
                if os.path.exists(snapshot_path):
                        symlink(snapshot_path, path)
                        logging.debug('exists in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (True, sha1)
                else:
                        if not os.path.exists(sha1_path):
                                logging.debug('correct hash for downloaded file, saving: %s %s %s %s', hash_type, hash, sha1, url)
                                # Binary mode: data is raw bytes.
                                with open(sha1_path, 'wb') as f:
                                        f.write(data)
                        else:
                                logging.debug('correct hash for downloaded file, not saving: already in derivs cache: %s %s %s %s', hash_type, hash, sha1, url)
                        symlink(os.path.relpath(sha1_path, os.path.dirname(path)), path)
                        logging.debug('does not exist in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (False, sha1)
        except requests.ConnectionError as e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)
        except requests.HTTPError as e:
                st = response.status_code
                logging.warning('unable to download hash file, ignoring: %s: %s %s', st, e, url)
                return ('unknown', None)
        except socket.error as e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)
354
def download_sha1(url, dir, sha1):
        """Download url and store it in dir's fan-out under its SHA-1,
        verifying the content against the expected SHA-1 first.

        Returns (False, sha1) on success (file is in the derivatives
        cache, not the snapshot farm) or ('unknown', None) on failure.
        """
        try:
                parent = hash_path_parent(dir,sha1)
                path = hash_path(dir,sha1)
                logging.debug('downloading sha1: %s %s', sha1, url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                response = requests.get(url, headers=headers, timeout=timeout, verify=ssl_verify)
                response.raise_for_status() # to catch HTTP errors
                # Without stream=True the body is already consumed;
                # response.content holds the bytes (raw.read() would not).
                data = response.content
                data_sha1 = hashlib.sha1(data).hexdigest()
                if data_sha1 == sha1:
                        logging.debug('correct sha1 for downloaded file, saving: %s %s', sha1, url)
                        if not os.path.exists(path):
                                # Binary mode: data is raw bytes.
                                with open(path, 'wb') as f:
                                        f.write(data)
                        return (False, sha1)
                else:
                        logging.warning('incorrect sha1 for downloaded file, ignoring: %s != %s %s', sha1, data_sha1, url)
                        return ('unknown', None)
        except requests.ConnectionError as e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', e, url)
                return ('unknown', None)
        except requests.HTTPError as e:
                st = response.status_code
                logging.warning('unable to download sha1 file, ignoring: %s: %s %s', st, e, url)
                return ('unknown', None)
        except socket.error as e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', e, url)
                return ('unknown', None)
385
386 # Functions for checking the hash caches
387
def check_hash_cache(dir, hash, hash_type, url):
        """Resolve a non-SHA-1 hash to a SHA-1 via the symlink cache.

        Entries in the md5/sha256 caches are symlinks whose target
        basename is the file's SHA-1.  Returns (True, sha1) when the file
        is in the snapshot farm, (False, sha1) when only in the
        derivatives cache, otherwise falls back to downloading.
        """
        logging.debug('checking hash cache: %s %s', hash_type, hash)
        path = hash_path(dir, hash)
        try:
                # The cache entry must be a symlink; resolve one level to
                # recover the SHA-1 from the target's basename.
                result = os.readlink(path)
                path = os.path.join(os.path.dirname(path), result)
        except OSError:
                # Not cached (or not a symlink): fetch it.
                logging.debug('does not exist in hash cache: %s %s', hash_type, hash)
                return download_and_check_hash(url, dir, hash, hash_type)
        logging.debug('exists in hash cache: %s %s', hash_type, hash)
        sha1 = os.path.basename(path)
        if snapshot_hash_path_exists(sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                # Snapshot has it; drop any redundant derivatives copy.
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                # Dangling symlink: re-download the file.
                logging.debug('missing in derivatives sha1 cache: %s', sha1)
                return download_and_check_hash(url, dir, hash, hash_type)
409
def check_sha1_cache(sha1, url):
        """Look for sha1 in the snapshot farm and the derivatives cache,
        downloading it when absent from both.

        Returns (True, sha1) when in the snapshot farm, (False, sha1)
        when in the derivatives cache or freshly downloaded, or
        ('unknown', None) on download failure.
        """
        logging.debug('checking sha1 caches: %s', sha1)
        if snapshot_hash_path_exists(sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                # Snapshot has it; drop any redundant derivatives copy.
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                logging.debug('does not exist in any sha1 caches: %s', sha1)
                return download_sha1(url, sha1_cache_dir, sha1)
422
def status(type, hash, url):
        """Classify one source-package part by checksum.

        Returns ('unmodified', sha1) when the file is known to the
        snapshot farm (i.e. was in Debian), ('modified', sha1) when only
        cached locally, or ('unknown', None) / (error, sha1) otherwise.
        """
        logging.debug('checking status of hash: %s %s %s', type, hash, url)
        # The three hash types share identical result handling; only the
        # cache lookup differs, so dispatch once and translate after.
        if type == 'sha1':
                (ret, sha1) = check_sha1_cache(hash, url)
        elif type == 'sha256':
                (ret, sha1) = check_hash_cache(sha256_cache_dir, hash, type, url)
        elif type == 'md5sum':
                (ret, sha1) = check_hash_cache(md5_cache_dir, hash, type, url)
        else:
                logging.warning('unknown hash type detected: %s %s %s', type, hash, url)
                return ('unknown', None)
        if ret is True:
                return ('unmodified', sha1)
        elif ret is False:
                return ('modified', sha1)
        else:
                # 'unknown' (or any other sentinel) passes straight through.
                return (ret, sha1)
452
453 # Functions for getting information about source packages
454
def get_info(srcpkg):
        """Extract the dsc and part list from a Sources stanza.

        Returns (dsc_hash_type, dsc_hash, dsc_name, parts) where parts is
        the deduplicated list of non-dsc file entries collected across all
        checksum headers, or None unless exactly one dsc is present.
        """
        dsc = None
        # Take the dsc entry from the first checksum header that has one.
        for header in checksum_headers:
                if not dsc and header in srcpkg:
                        dsc = [x for x in srcpkg[header] if x['name'].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any dsc files')
                return None
        if len(dsc) > 1:
                # Each entry is a dict; report its name field.  (The old
                # tuple-unpacking here raised ValueError on 3-key dicts.)
                logging.warning('found multiple dsc files: %s' % ' '.join([d['name'] for d in dsc]))
                return None
        dsc = dsc[0]
        dsc_name = dsc['name']
        # The single key besides name/size is the checksum field.
        dsc_hash_type, dsc_hash = [(k, v) for k, v in dsc.items() if k not in ('name', 'size')][0]

        parts = []
        part_names = []
        for header in checksum_headers:
                if header in srcpkg:
                        for part in srcpkg[header]:
                                if 'name' in part and part['name'] not in part_names and not part['name'].endswith('.dsc'):
                                        parts.append(part)
                                        part_names.append(part['name'])

        return (dsc_hash_type, dsc_hash, dsc_name, parts)
480
def get_debian_info(files):
        """Split snapshot file rows into the dsc and the remaining parts.

        files is a sequence of (name, sha1) pairs.  Returns
        (dsc_sha1, dsc_name, parts) using the first dsc found (parts is
        name-deduplicated, first occurrence wins), or None when no dsc is
        present (snapshots bug or ancient source package).
        """
        dscs = [entry for entry in files if entry[0].endswith('.dsc')]
        if not dscs:
                logging.warning('did not find any Debian dsc files: snapshots bug or ancient source package')
                return None
        if len(dscs) > 1:
                logging.warning('found multiple Debian dsc files, choosing first one: %s' % ' '.join(['%s %s' % (dsc_name, dsc_sha1) for dsc_name, dsc_sha1 in dscs]))

        dsc_name, dsc_sha1 = dscs[0]

        seen_names = []
        parts = []
        for entry in files:
                part_name = entry[0]
                if part_name not in seen_names and not part_name.endswith('.dsc'):
                        parts.append(entry)
                        seen_names.append(part_name)

        return (dsc_sha1, dsc_name, parts)
501
502 # Functions for extracting information from the snapshots database
503
def database_error(e):
        """Log a failed database query and reset the connection so later
        queries can proceed."""
        code = getattr(e, 'pgcode', None)
        reason = getattr(e, 'pgerror', None)
        logging.warning('unable to execute database query: %s %s', code, reason)
        conn.reset()
511
def srcpkg_was_in_debian(name, version=None):
        """Whether a source package (optionally at an exact version) ever
        appeared in Debian, per the snapshot database.

        Returns True/False, or None on database error.
        """
        try:
                if version:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s AND version=%s LIMIT 1;', (name, version))
                else:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s LIMIT 1;', (name,))
                # fetchone() is a row tuple or None.
                return cur.fetchone() is not None
        except psycopg2.Error as e:
                database_error(e)
                return None
523
def sha1_to_srcpkgs(sha1):
        """(name, version) of every source package that ever shipped a
        file with this SHA-1, or None on database error."""
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
535
def srcpkg_to_sha1s(name, version):
        """SHA-1 hashes of every file in the given source package
        version, or None on database error."""
        try:
                cur.execute(
                        '''SELECT hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        WHERE name=%s AND version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
547
def srcpkg_to_srcpkgs(name):
        """All (name, version) rows for a source package, newest version
        first, or None on database error."""
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s ORDER BY version DESC;''', (name,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
558
def sha1s_to_files(sha1):
        """(name, hash) rows of all snapshot files with this SHA-1, or
        None on database error."""
        try:
                # Bind the sha1 argument; the original mistakenly passed
                # the `hash` builtin as the parameter sequence.
                cur.execute('SELECT DISTINCT ON (name, hash) name, hash FROM file WHERE hash=%s;', (sha1,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
566
def srcpkg_to_files(name, version):
        """(name, hash) rows of every file in the given source package
        version, or None on database error."""
        try:
                cur.execute(
                        '''SELECT DISTINCT ON (file.name, file.hash) file.name, file.hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        JOIN file ON file_srcpkg_mapping.hash=file.hash
                        WHERE srcpkg.name=%s AND srcpkg.version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
579
def sha1_version_to_derived_from(sha1, version):
        """Best guess at the Debian (name, version) a file derives from.

        Prefers the newest package version <= the given version that
        shipped the file; falls back to the oldest version that shipped
        it.  Returns None on database error.
        """
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s and version<=%s
                        ORDER BY name ASC, version DESC
                        LIMIT 1;''', (sha1, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s
                        ORDER BY name ASC, version ASC
                        LIMIT 1;''', (sha1, version))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
602
def srcpkg_to_derived_from(name, version):
        """Best guess at the Debian version a package derives from, by
        name alone: newest version <= the given one, else the oldest.

        Returns the matching (name, version) rows, or None on database
        error.
        """
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s and version<=%s
                        ORDER BY version DESC
                        LIMIT 1;''', (name, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s
                        ORDER BY version ASC
                        LIMIT 1;''', (name,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
623
624 # Functions related to creating patches
625
626 # Add symlinks for all needed files
def prepare(dsc_name, dsc_sha1, parts):
        """Stage a derivative source package into a temp dir for debdiff.

        Symlinks the dsc and all its parts into a fresh temporary
        directory, recompressing any lzip parts to gzip (and updating the
        dsc checksums accordingly).  Returns (tmp_dir, total_size);
        tmp_dir is None when any part is unreadable or recompression
        failed.
        """
        logging.debug('preparing deriv directory for %s', dsc_name)
        total_size = 0
        unreadable_parts = []
        symlink_parts = []
        convert_lzip_parts = []
        converted_parts = []
        # Prefer the snapshot farm copy of the dsc, else the derivs cache.
        path = snapshot_hash_path(dsc_sha1)
        if not os.path.exists(path): path = hash_path(sha1_cache_dir, dsc_sha1)
        if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
        symlink_parts.append((path, dsc_name))
        for part in parts:
                path = part_hash_path(part)
                if not path: continue
                if not os.access(path, os.R_OK):
                        unreadable_parts.append(part['name'])
                        continue
                size = uncompressed_size(path)
                total_size += size
                symlink_parts.append((path, part['name']))
                if part['name'].endswith('.lz'):
                        # Count lzip parts twice: they exist decompressed
                        # and recompressed during conversion.
                        total_size += size
                        convert_lzip_parts.append(part['name'])

        if unreadable_parts:
                logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
                return (None, total_size)

        # Ensure that the debdiff will have enough space
        tmp_parent = None
        if total_size*2.0 > tmp_size()/2.0 or total_size > tmp_space():
                logging.info('prepare: not enough space in /tmp, using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())
                tmp_parent = os.path.expanduser('~/tmp')
        elif dsc_name.startswith('libreoffice_') or dsc_name.startswith('iceweasel_'):
                # These are known to be too large for /tmp regardless.
                logging.info('prepare: iceweasel/libreoffice, always using scratch space')
                tmp_parent = os.path.expanduser('~/tmp')
        else:
                logging.debug('prepare: enough space in /tmp, not using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())

        tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-%s-' % derivative_short_name, dir=tmp_parent)

        # Setup the symlinks for dpkg-source/debdiff/etc
        for symlink_target, symlink_name in symlink_parts:
                symlink_path = os.path.join(tmp_dir, symlink_name)
                os.symlink(symlink_target, symlink_path)

        # Some distributions allow additional compression schemes
        # Here we work around this by recompressing with gzip
        for part_name in convert_lzip_parts:
                converted = convert_lzip_to_gzip(tmp_dir, part_name)
                if converted is not None:
                        converted_parts.append(converted)
                else:
                        # Conversion failed: clean up the staging dir and bail.
                        rmtree(tmp_dir)
                        return (None, total_size)

        # Update the dsc file if we recompressed any files
        if converted_parts:
                update_dsc_file(tmp_dir, dsc_name, converted_parts)

        return (tmp_dir, total_size)
688
def prepare_debian(dsc_name, dsc_sha1, files, total_size):
	# Create a temporary directory populated with symlinks to the parts of
	# a Debian source package (the dsc plus each file from the snapshot
	# database) so that debdiff can later be run against it.
	#
	# dsc_name: filename of the Debian dsc file
	# dsc_sha1: SHA-1 of the dsc, used to locate it in the snapshot farm
	# files: iterable of (part_name, part_sha1) pairs for the non-dsc parts
	# total_size: uncompressed size accumulated so far (from the derivative
	#             side), extended here and used to choose the temp location
	#
	# Returns the temporary directory path, or None if any part is unreadable.
	logging.debug('preparing Debian directory for %s', dsc_name)
	unreadable_parts = []
	symlink_parts = []
	path = snapshot_hash_path(dsc_sha1)
	if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
	symlink_parts.append((path, dsc_name))
	for part_name, part_sha1 in files:
		path = snapshot_hash_path(part_sha1)
		if not os.access(path, os.R_OK):
			unreadable_parts.append(part_name)
			continue
		total_size += uncompressed_size(path)
		symlink_parts.append((path, part_name))

	if unreadable_parts:
		logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
		return None

	# Ensure that the debdiff will have enough space
	tmp_parent = None
	if total_size > tmp_space():
		logging.info('prepare_debian: not enough space in /tmp, using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())
		tmp_parent = os.path.expanduser('~/tmp')
	elif dsc_name.startswith('libreoffice_') or dsc_name.startswith('iceweasel_'):
		# These packages are known to be huge, always use scratch space
		# (was mislabelled 'prepare:' in the log message, fixed to match
		# the other messages from this function)
		logging.info('prepare_debian: iceweasel/libreoffice, always using scratch space')
		tmp_parent = os.path.expanduser('~/tmp')
	else:
		logging.debug('prepare_debian: enough space in /tmp, not using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())

	debian_tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-Debian-', dir=tmp_parent)

	# Setup the symlinks for dpkg-source/debdiff/etc
	for symlink_target, symlink_name in symlink_parts:
		os.symlink(symlink_target, os.path.join(debian_tmp_dir, symlink_name))

	return debian_tmp_dir
729
def get_changelog_entries(tmp_dir, dsc_name, dsc_sha1):
	# Extract the source package and return its changelog entries as a
	# list of (package, version) tuples, newest first, or None when the
	# package cannot be extracted or its changelog cannot be read.
	# Results are cached on disk, keyed by the SHA-1 of the dsc file.
	logging.debug('getting changelog entries from %s', dsc_name)

	# Cache check
	changelog_path = hash_path(sha1_changelog_dir, dsc_sha1)
	if os.path.exists(changelog_path):
		logging.debug('changelog cache exists for %s %s', dsc_name, dsc_sha1)
		with open(changelog_path) as f:
			try: changelog_entries = json.load(f)
			except ValueError: pass # corrupt cache entry: fall through and regenerate
			else: return [tuple(entry) for entry in changelog_entries]

	# Preparation
	extract_path = os.path.join(tmp_dir,'extracted')

	# Unpack the source tree
	logging.debug('unpacking source package %s', dsc_name)
	cmdline = ['dpkg-source', '-x', dsc_name, 'extracted']
	process = subprocess.Popen(cmdline, cwd=tmp_dir, env=tmp_environ(tmp_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
	output = process.communicate()[0]
	if process.returncode:
		logging.warning('dpkg-source reported failure to extract %s:', dsc_name)
		logging.warning(output)
		# Log a recursive directory listing to help debug the failure
		cmdline = ['ls', '-lR', '--time-style=+']
		process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		output = process.communicate()[0]
		logging.warning(output)
		rmtree(extract_path)
		return None

	# Sanitise the debian dir and changelog file in case it is a symlink to outside
	debian_dir = os.path.join(extract_path, 'debian')
	changelog_filename = os.path.join(debian_dir,'changelog')
	if os.path.islink(debian_dir) or os.path.islink(changelog_filename):
		logging.warning('debian dir or changelog is a symbolic link %s', dsc_name)
		rmtree(extract_path)
		return None

	# Check if the changelog exists
	if not os.path.exists(changelog_filename):
		logging.warning('could not find changelog in %s', dsc_name)
		rmtree(extract_path)
		return None

	# Find out which source package is the most likely derivative
	logging.debug('parsing changelog for %s', dsc_name)
	changelog_file = open(changelog_filename)
	try:
		try:
			changelog_obj = changelog.Changelog(changelog_file)
		except UnicodeDecodeError:
			# Not valid UTF-8: retry with latin-1, which accepts any byte
			changelog_file.seek(0)
			changelog_obj = changelog.Changelog(changelog_file, encoding='iso-8859-1')
		try:
			changelog_entries = [(entry.package, str(entry._raw_version)) for entry in changelog_obj]
		except Exception:
			# python-debian can raise various exception types on malformed
			# changelogs; was a bare except that also swallowed
			# KeyboardInterrupt/SystemExit
			logging.warning('could not read changelog from %s', dsc_name)
			rmtree(extract_path)
			return None
	finally:
		changelog_file.close()

	# Clean up again
	rmtree(extract_path)

	# Write the cache
	makedirs(hash_path_parent(sha1_changelog_dir, dsc_sha1))
	remove(changelog_path)
	with open(changelog_path, 'w') as f:
		json.dump(changelog_entries, f)

	return changelog_entries
801
# Find the source package name and version this is probably derived from
def find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified):
	"""Heuristically find the Debian source package this one is based on.

	tmp_dir: temporary dir containing the (symlinked) derivative package
	name, version: the derivative's source package name and version
	dsc_name, dsc_sha1: the derivative's dsc filename and its SHA-1
	parts_unmodified: (sha1, name) pairs of parts known to be in Debian

	Heuristics, tried in order:
	1. a changelog entry that matches a candidate from the database
	2. a changelog entry that was ever in Debian
	3. the highest candidate version that is <= the derivative's version
	4. the closest version number recorded in Debian for this name

	Returns a (name, version) tuple, or None if nothing plausible found.
	"""
	logging.debug('finding base source package of %s %s', name, version)

	# Get a list of changelog entries
	changelog_entries = get_changelog_entries(tmp_dir, dsc_name, dsc_sha1)
	if changelog_entries:
		logging.debug('changelog entries are: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in changelog_entries]))

	# Get a list of candidate versions from the database:
	# every source package that ever shipped one of the unmodified parts
	possibly_derived_from = []
	logging.debug('checking which parts were in Debian')
	for part_sha1, part_name in parts_unmodified:
		part_derived_from = sha1_to_srcpkgs(part_sha1)
		if part_derived_from:
			logging.debug('part %s %s available in %s', part_sha1, part_name, ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in part_derived_from]))
			possibly_derived_from.extend(part_derived_from)

	# No parts in common with Debian: fall back to all recorded versions
	# of the same-named source package
	if not possibly_derived_from:
		logging.debug('no parts in common with Debian, obtaining old versions')
		old_packages = srcpkg_to_srcpkgs(name)
		if old_packages: possibly_derived_from = old_packages

	# Uniqify
	possibly_derived_from = list(set(possibly_derived_from))
	if possibly_derived_from:
		logging.debug('possibly derived from: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in possibly_derived_from]))
	else:
		logging.debug('nothing in possibly derived from list')

	# Match changelog versions against candidates
	if changelog_entries:
		logging.debug('matching changelog entries against versions possibly derived from')
		for entry in changelog_entries:
			entry_name, entry_version = entry
			if entry in possibly_derived_from:
				logging.debug('%s %s in possibly derived from', entry_name, entry_version)
				return entry
		logging.debug('checking if changelog entries were ever in Debian')
		for entry_name, entry_version in changelog_entries:
			if srcpkg_was_in_debian(entry_name, entry_version):
				logging.debug('%s %s was in Debian', entry_name, entry_version)
				return (entry_name, entry_version)
	if possibly_derived_from:
		logging.debug('finding closest entry in possibly derived from')
		# Sort candidates by version, highest first
		possibly_derived_from.sort(cmp=lambda a,b: apt_version_cmp(b[1],a[1]))
		for entry_name, entry_version in possibly_derived_from:
			if name == entry_name and apt_version_cmp(version, entry_version) >= 0:
				logging.debug('%s %s is an equal or lower version', entry_name, entry_version)
				return (entry_name, entry_version)
		# All candidates are higher versions; take the lowest of them
		entry = possibly_derived_from[-1]
		entry_name, entry_version = entry
		logging.debug('no lower version numbers, returning next highest version %s %s', entry_name, entry_version)
		return entry
	# Last resort: ask the database for the closest version in Debian
	logging.debug('finding closest version number in Debian')
	for entry in srcpkg_to_derived_from(name, version):
		entry_name, entry_version = entry
		logging.debug('closest package was %s %s', entry_name, entry_version)
		return entry
	logging.debug('could not find Debian package %s %s is derived from', name, version)
	return None
863
# Generate a patch file
def _log_dir_listings(tmp_dir, debian_tmp_dir):
	# Log recursive directory listings of both package dirs to help debug
	# debdiff failures (previously duplicated inline in create_patch).
	cmdline = ['ls', '-lR', '--time-style=+']
	for name, dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
		logging.warning('dir listing for %s:', name)
		process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		output = process.communicate()[0]
		logging.warning(output)

def create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1):
	# Generate and cache the debdiff between the Debian source package and
	# the derivative's source package, plus a filterdiff restricted to the
	# debian/ directory (with a diffstat header). Both are keyed by the
	# SHA-1s of the two dsc files. Patches over 100MB are replaced with a
	# symlink to a placeholder.
	#
	# Returns True when usable patches exist (or were already cached),
	# False when debdiff failed or the two packages were identical.
	global repackaged_but_identical

	dsc_path = os.path.join(tmp_dir, dsc_name)
	debian_dsc_path = os.path.join(debian_tmp_dir, debian_dsc_name)
	path_everything = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	path_debian = sha1_patch_path(debian_dsc_sha1, dsc_sha1, 'debian')

	# Generate the main patch, unless it is already cached
	if not os.path.exists(path_everything) and os.path.exists(debian_dsc_path) and os.path.exists(dsc_path):
		makedirs(os.path.dirname(path_everything))
		cmdline = ['debdiff', '--quiet', '--diffstat', debian_dsc_path, dsc_path]
		# with-statement ensures the output file is closed even on error
		with open(path_everything, 'w') as stdout:
			process = subprocess.Popen(cmdline, env=tmp_environ(tmp_dir), stdout=stdout, stderr=subprocess.PIPE, preexec_fn=subprocess_setup)
			output = process.communicate()[1]
		# debdiff exit codes: 0 = no differences, 1 = differences, 255 = error
		if process.returncode == 255:
			logging.warning('debdiff reported failure %s %s:', debian_dsc_name, dsc_name)
			logging.warning(output)
			_log_dir_listings(tmp_dir, debian_tmp_dir)
			return False
		elif process.returncode == 0:
			logging.info('derivative repackaged in an identical way %s %s', debian_dsc_name, dsc_name)
			repackaged_but_identical += 1
			return False
		elif process.returncode != 1:
			logging.warning('debdiff reported unknown return code %s %s %s:', process.returncode, debian_dsc_name, dsc_name)
			logging.warning(output)
			_log_dir_listings(tmp_dir, debian_tmp_dir)
			return False

	# Filter the main patch to include only the debian/ directory
	if os.path.exists(path_everything) and not os.path.exists(path_debian):
		makedirs(os.path.dirname(path_debian))
		cmdline = ['filterdiff', '--include=*/debian/*', path_everything]
		filterdiff = subprocess.Popen(cmdline, env=tmp_environ(tmp_dir), stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		filterdiff_output = filterdiff.communicate()[0]
		diffstat = subprocess.Popen('diffstat', stdin=subprocess.PIPE, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		diffstat_output = diffstat.communicate(filterdiff_output)[0]
		with open(path_debian, 'w') as f:
			f.write('diffstat of debian/ for %s %s\n' % (os.path.splitext(debian_dsc_name)[0], os.path.splitext(dsc_name)[0]))
			f.write('\n')
			f.write(diffstat_output)
			f.write('\n')
			f.write(filterdiff_output)

	# Patches > 100MB are probably not that useful, replace them with a link
	for path in path_everything, path_debian:
		try:
			if os.path.getsize(path) > 104857600:
				logging.info('patch between %s and %s is larger than 100MB', dsc_name, debian_dsc_name)
				remove(path)
				symlink(os.path.relpath(patch_too_large, os.path.dirname(path)), path)
		except OSError:
			# getsize raises when the patch file does not exist; skip it
			pass

	return True
932
def check_patch(debian_dsc_sha1, dsc_sha1):
	# Decide whether the cached patch between the two dsc files is worth
	# presenting: it is useful only if it touches at least one file other
	# than debian/changelog. The lsdiff output is cached on disk.
	patch_path = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	lsdiff_path = sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1)
	if not os.path.exists(lsdiff_path):
		# Generate the file listing with lsdiff and cache it
		logging.debug('lsdiff cache does not exist for %s', patch_path)
		process = subprocess.Popen(['lsdiff', patch_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		listing = process.communicate()[0]
		makedirs(os.path.dirname(lsdiff_path))
		with open(lsdiff_path,'w') as cache:
			cache.write(listing)
	else:
		logging.debug('lsdiff cache exists for %s', patch_path)
		with open(lsdiff_path) as cache:
			listing = cache.read()
	changed_files = listing.splitlines()
	for changed in changed_files:
		if changed == 'debian/changelog' or changed.endswith('/debian/changelog'):
			continue
		logging.debug('patch changes files other than debian/changelog')
		return True
	if changed_files:
		logging.debug('patch does not change files other than debian/changelog')
	else:
		logging.debug('patch does not change any files')
	return False
958
def present_patch(name, version, dsc_sha1, debian_name, debian_version,  debian_dsc_sha1):
	# Publish the cached patches by symlinking them into both the
	# per-derivative and the global patch directories. Returns a tuple of
	# patch paths relative to the global patch dir; empty when no patch
	# files exist or the patch only touches debian/changelog.
	useful = check_patch(debian_dsc_sha1, dsc_sha1)
	published = []
	for type in ('', 'debian'):
		target = sha1_patch_path(debian_dsc_sha1, dsc_sha1, type)
		if not os.path.exists(target):
			continue
		deriv_link = deriv_patch_path(name, version, debian_name, debian_version, type)
		global_link = global_patch_path(name, version, debian_name, debian_version, type)
		# Replace any stale links with relative symlinks to the cached patch
		for link in (deriv_link, global_link):
			makedirs(os.path.dirname(link))
			remove(link)
			symlink(os.path.relpath(target, os.path.dirname(link)), link)
		if useful:
			published.append(os.path.relpath(global_link, os.path.abspath(global_patch_dir)))
	return tuple(published)
978
979 # Functions that wrap other functions and decide what to do
980
def check_source_package(source_entry, srcpkg):
	"""Check one derivative source package against Debian.

	source_entry: the apt sources.list entry this package was listed in
	srcpkg: the deb822 Sources paragraph describing the package

	Returns None when the package is unmodified from Debian or cannot be
	processed; otherwise a (files, patch, link, new) tuple whose elements
	may each be None:
	files: list of (sha1, hash_type, hash) for the dsc and modified parts
	patch: patch metadata tuple when a debdiff against Debian was created
	link: (debian_name, debian_version, name, version, dsc_url) when a
	      base package in Debian was identified
	new: (name, version, dsc_url) when no Debian base could be found
	"""
	global modifies_dsc_files

	try:
		name = None
		version = None
		dir = None
		name = srcpkg['Package']
		version = srcpkg['Version']
		dir = srcpkg['Directory']
		# Reject values that could be used to escape the mirror tree
		if '/' in name or name == '..':
			logging.warning('could not process source package %s %s: possibly malicious name', name, version)
			return None
		if '/' in version or version == '..':
			logging.warning('could not process source package %s %s: possibly malicious version', name, version)
			return None
		if '..' in dir.split('/'):
			logging.warning('could not process source package %s %s: possibly malicious dir: %s', name, version, dir)
			return None
	except KeyError:
		# One of the mandatory Sources fields is missing
		logging.warning('could not process source package %s %s', name, version)
		return None
	logging.debug('started processing source package %s %s', name, version)
	info = get_info(srcpkg)
	if not info:
		logging.warning('finished processing source package %s %s: could not get any info', name, version)
		return None
	dsc_hash_type, dsc_hash, dsc_name, parts = info
	if '/' in dsc_name or dsc_name == '..':
		logging.warning('could not process source package %s %s: possibly malicious dsc name %s', name, version, dsc_name)
		return None
	if not ishex(dsc_hash):
		logging.warning('could not process source package %s %s: possibly malicious dsc hash %s', name, version, dsc_hash)
		return None
	dsc_url = source_entry.archive_uri('%s/%s' % (dir, dsc_name))
	logging.debug('found dsc file: %s %s %s', dsc_hash_type, dsc_hash, dsc_url)
	dsc_status, dsc_sha1 = status(dsc_hash_type, dsc_hash, dsc_url)
	logging.debug('checked dsc status: %s %s %s', dsc_status, dsc_sha1, dsc_url)
	if dsc_status == 'unmodified':
		# Ignore the srcpkg since we know it is was in Debian
		# at one point and is hopefully therefore unmodified
		logging.debug('finished processing source package %s %s: dsc unmodified', name, version)
		return None
	else:
		files = [(dsc_sha1, dsc_hash_type, dsc_hash)]
		parts_unmodified = []
		parts_modified = []
		parts_unknown = []
		# Classify every non-dsc part as unmodified/modified/unknown
		for part in parts:
			part_name = part['name']
			if '/' in part_name or part_name == '..':
				logging.warning('could not process source package %s %s: possibly malicious part name %s', name, version, part_name)
				return None
			part_url = source_entry.archive_uri('%s/%s' % (dir, part_name))
			# The one key that is not name/size is the hash field
			part_hash_type, part_hash = [(k, v) for k, v in part.iteritems() if k not in ('name', 'size')][0]
			if not ishex(part_hash):
				logging.warning('could not process source package %s %s: possibly malicious part hash %s', name, version, part_hash)
				return None
			logging.debug('found part file: %s %s %s', part_hash_type, part_hash, part_url)
			part_status, part_sha1 = status(part_hash_type, part_hash, part_url)
			logging.debug('checked part status: %s %s %s', part_status, part_sha1, part_url)
			# Cache the SHA-1 on the part dict for later use (e.g. prepare)
			if 'sha1' not in part and part_sha1: part['sha1'] = part_sha1
			if part_status == 'unmodified': parts_unmodified.append((part_sha1, part_name))
			elif part_status == 'modified': parts_modified.append((part_sha1, part_name))
			else: parts_unknown.append((part_sha1, part_name))
			if part_status == 'modified': files.append((part_sha1, part_hash_type, part_hash))

		# Compare counts before deduplicating, to catch every part
		all_parts_unmodified = (len(parts_unmodified) == len(parts))
		parts_unmodified = list(set(parts_unmodified))
		logging.debug('source package status %s %s: dsc %s, %s parts unmodified, %s parts modified, %s parts unknown', name, version, dsc_status, len(parts_unmodified), len(parts_modified), len(parts_unknown))

		if all_parts_unmodified:
			# Ignore the srcpkg since we know all the parts were
			# in Debian at one point and ergo, it is unmodified
			logging.debug('finished processing source package %s %s: all non-dsc parts unmodified', name, version)
			if dsc_status == 'modified':
				logging.info('source package %s %s: unmodified, but dsc different', name, version)
				modifies_dsc_files += 1
			return (files, None, None, None)
		else:
			logging.debug('some parts modified, looking for derived version %s %s', name, version)
			if not dsc_sha1:
				logging.warning('finished processing source package %s %s: sha1 missing for dsc file', name, version)
				return (files, None, None, None)
			if parts_unknown:
				logging.warning('finished processing source package %s %s: sha1 missing for some parts', name, version)
				return (files, None, None, None)
			new = None
			link = None
			patch = None
			# Assemble the derivative package in a temporary directory
			tmp_dir, size = prepare(dsc_name, dsc_sha1, parts)
			if not tmp_dir:
				logging.warning('source package %s %s: could not create temporary dir for deriv: %s', name, version, dsc_name)
				return (files, None, None, None)
			derived_from = find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified)
			if derived_from:
				debian_name, debian_version = derived_from
				link = (debian_name, debian_version, name, version, dsc_url)
				logging.debug('source package %s %s derived from %s %s', name, version, debian_name, debian_version)
				debian_files = srcpkg_to_files(debian_name, debian_version)
				if debian_files:
					debian_info = get_debian_info(debian_files)
					if debian_info:
						debian_dsc_sha1, debian_dsc_name, debian_parts = debian_info
						logging.debug('Debian source package %s %s dsc found %s %s', debian_name, debian_version, debian_dsc_name, debian_dsc_sha1)
						debian_tmp_dir = prepare_debian(debian_dsc_name, debian_dsc_sha1, debian_parts, size)
						if debian_tmp_dir:
							patch_created = create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1)
							if patch_created:
								patch_names = present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1)
								if patch_names:
									patch = (debian_name, debian_version, debian_dsc_sha1, name, version, dsc_sha1, [part[0] for part in parts_modified], patch_names)
								else:
									logging.debug('patch between %s %s and %s %s is probably not useful', debian_name, debian_version, name, version)
							rmtree(debian_tmp_dir)
						else:
							# This could be an issue with disk space, snapshots or a file that is not distributable
							logging.warning('source package %s %s: could not create temporary dir for Debian: %s %s', name, version, debian_name, debian_version)
					else:
						logging.warning('source package %s %s: could not get Debian info for %s %s: %s', name, version, debian_name, debian_version, debian_info)
				else:
					if srcpkg_was_in_debian(debian_name, debian_version):
						logging.warning('source package %s %s: snapshot database issue, no Debian files found', debian_name, debian_version)
					else:
						logging.warning('source package %s %s: derived from %s %s possibly bogus', name, version, debian_name, debian_version)
			else:
				# No plausible base in Debian: report the package as new
				new = (name, version, dsc_url)
			rmtree(tmp_dir)
			logging.debug('finished processing source package %s %s: all done', name, version)
			return (files, patch, link, new)
1111
def process_sources(source_entries, lists_dir):
	"""Check every source package listed in every sources.list entry.

	source_entries: the apt source entries of the derivative's sources.list
	lists_dir: directory containing the Sources files previously
	downloaded by get-package-lists

	Returns (files, patches, links, new): the accumulated per-package
	results of check_source_package().
	"""
	files = []
	patches = []
	links = []
	new = []
	for source in source_entries:
		for source_entry in source:
			logging.debug('processing sources.list entry %s', source_entry.describe)
			# The on-disk Sources file is named after the text inside the
			# parentheses of the entry description
			fn = os.path.join(lists_dir, source_entry.describe.rstrip(')').rpartition('(')[2])
			# Skip entries whose Sources file was never downloaded
			# (was `file(fn)`: removed in Python 3, open() is canonical)
			try: f = open(fn)
			except IOError: continue
			# with-statement closes the file even if processing raises
			with f:
				for srcpkg in deb822.Sources.iter_paragraphs(f):
					actions = check_source_package(source_entry, srcpkg)
					if actions:
						action_files, action_patch, action_link, action_new = actions
						if action_files:
							files.append(action_files)
							logging.debug('action: return files %s', ' '.join([' '.join([str(item) for item in action]) for action in action_files]))
						if action_patch:
							patches.append(action_patch)
							# str() each element: action_patch mixes strings,
							# a list and a tuple, so ' '.join(action) would
							# space-separate the characters of string elements
							logging.debug('action: return patches %s', ' '.join([str(action) for action in action_patch]))
						if action_link:
							links.append(action_link)
							logging.debug('action: return links to modified source packages %s', ' '.join(action_link))
						if action_new:
							new.append(action_new)
							logging.debug('action: return links to new source packages %s', ' '.join(action_new))
					logging.debug('done')
					logging.debug('')
	return (files, patches, links, new)
1143
logging.debug('processing distribution %s', derivative_short_name)

# Compare every source package in the derivative against Debian
files, patches, links, new = process_sources(source_entries, lists_dir)

# Done with the database, close the connection
cur.close()
conn.close()
1151
# Write out the results

# Map each file's SHA-1 to every other hash type observed for it,
# warning when two packages disagree about a hash value.
filename = sys.argv[3]
data = files
if data:
	output_data = {}
	for package in data:
		for file_sha1, hash_kind, hash_value in package:
			hashes = output_data.setdefault(file_sha1, {})
			if hash_kind == 'sha1':
				continue
			if hash_kind not in hashes:
				hashes[hash_kind] = hash_value
			elif hash_value != hashes[hash_kind]:
				logging.warning('hashes mismatched: %s: %s %s != %s', file_sha1, hash_kind, hash_value, hashes[hash_kind])
	with open(os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)
1168
# Second output: one YAML record per modified source package, describing
# the Debian base package and the generated patches.
filename = sys.argv[4]
data = patches
if data:
	# Symlink in the Apache HEADER/.htaccess files for the patch directories.
	# BUGFIX: the derivative .htaccess check previously tested
	# global_patch_dir instead of deriv_patch_dir, so the derivative link
	# was skipped whenever the global one already existed; the existence
	# check must be on the same directory the link is created in.
	if not os.path.exists(os.path.join(global_patch_dir,'HEADER.html')):
		symlink('../../doc/HEADER.patches.html',os.path.join(global_patch_dir,'HEADER.html'))
	if not os.path.exists(os.path.join(global_patch_dir,'.htaccess')):
		symlink('../../etc/htaccess.patches',os.path.join(global_patch_dir,'.htaccess'))
	if not os.path.exists(os.path.join(deriv_patch_dir,'HEADER.html')):
		symlink('../../../doc/HEADER.patches.html',os.path.join(deriv_patch_dir,'HEADER.html'))
	if not os.path.exists(os.path.join(deriv_patch_dir,'.htaccess')):
		symlink('../../../etc/htaccess.patches',os.path.join(deriv_patch_dir,'.htaccess'))
	output_data = []
	for entry in data:
		# Renamed the unpacked local from 'patches' to 'patch_files': the
		# old name rebound the module-level 'patches' list while it was
		# being iterated (via the 'data' alias) — harmless today, but a
		# trap for future code that reads 'patches' after this loop.
		debian_name, debian_version, debian_sha1, name, version, sha1, parts_sha1, patch_files = entry
		output_data.append({
			'debian_name': debian_name,
			'debian_version': debian_version,
			'debian_sha1': debian_sha1,
			'name': name,
			'version': version,
			'sha1': sha1,
			'patches': patch_files,
			'parts': parts_sha1,
		})
	with open(os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)
else:
	# No modified packages: drop any stale output from a previous run.
	remove(filename)
1197
# Third output: for each Debian package/version, the list of derivative
# packages (name/version/dsc URL) that appear to be based on it.
filename = sys.argv[5]
data = links
if data:
	# Deduplicate, then order by Debian name, Debian version, derivative
	# name and derivative version (versions compared the apt way).
	ordered = sorted(set(data), cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]) or apt_version_cmp(a[3],b[3]))
	output_data = {}
	for debian_name, debian_version, name, version, dsc_url in ordered:
		per_version = output_data.setdefault(debian_name, {})
		per_version.setdefault(debian_version, []).append({
			'name': name,
			'version': version,
			'dsc': dsc_url,
		})
	with open(os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)
else:
	# Nothing to link: drop any stale output from a previous run.
	remove(filename)
1218
# Fourth output: source packages with no apparent Debian counterpart,
# keyed by name then version, each mapping to its dsc URLs.
filename = sys.argv[6]
data = new
if data:
	# Deduplicate, then order by name and (apt-compared) version for
	# stable, reproducible output.
	ordered = sorted(set(data), cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]))
	output_data = {}
	for name, version, dsc_url in ordered:
		output_data.setdefault(name, {}).setdefault(version, []).append(str(dsc_url))
	with open(os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)
else:
	# No new packages: drop any stale output from a previous run.
	remove(filename)

logging.shutdown()