08f07fea8eaf8f53aae0fb05f0df44ab91fbb5a7
[dex/census.git] / bin / compare-source-package-list
1 #!/usr/bin/python
2
3 # Copyright 2011 Paul Wise
4 # Released under the MIT/Expat license, see doc/COPYING
5
6 # Uses the snapshot.debian.org metadata database and SHA-1 based filesystem to
7 # compute debdiffs between Debian and individual derivatives. The metadata
8 # allows knowing if a particular file was ever in Debian and the filesystem
9 # allows the creation of debdiffs.
10 #
11 # The script works approximately like this:
12 #
13 # Load the Sources files previously downloaded by get-package-lists as indicated
14 # by the sources.list of the derivative.
15 #
16 # For each source package in the derivative:
17 #
18 # Check if the dsc has ever been in Debian, if not, check if the other
19 # parts have and therefore decide if the package is unmodified or not.
20 # Unmodified source packages are skipped and include those with the exact
21 # same dsc file or those where all the non-dsc parts are identical.
22 #
23 # Try some heuristics (name, version, changelog entries) to find out if
24 # the package could be based on some package that is or was in Debian.
25 #
26 # If it was not then skip to the next one and make a note, since Debian
27 # might want to know about source packages that are missing from Debian.
28 #
29 # If it was then use debdiff to create a diff and filterdiff to create a
30 # diff of the debian/ dir.
31 #
32 # Usage:
33 # compare-source-package-list <sources.list> <apt dir> <patches list> <links list> <new package list> <log file>
34
35 # FIXME: write out some statistics and rrdtool graphs
36 #               source package types per derivative
37 #               number of source packages
38 #               cache misses: md5, sha256, sha1, patch, changelog
39 # FIXME: comment the code to list assumptions and function purpose
40 # FIXME: add options to allow re-processing only specific packages
41 # FIXME: write something to clean up old files and patches
42 # FIXME: don't unpack or make a patch when we don't have all the parts
43 # FIXME: don't make a patch when we were not able to unpack the source package
44 # FIXME: cleanup files at start of run
45 # FIXME: extract new debian/patches/ patches
46 # FIXME: print out packages that are no longer in Debian
47 # FIXME: deal with really large patches:
48 # FIXME:   kde-l10n-*: too few parts to be useful
49 # FIXME:   divergence: too many changelog entries between versions to be useful
50 # FIXME:   derivative is older than Debian
51 # FIXME:   derivative renamed the source package
52 # FIXME:   just a really big diff
53 # FIXME: when there are multiple dsc files in snapshots, prefer the debian/debian-archive one
54 # FIXME: when the source package is ancient and the dsc is missing, make a fake one to use
55 # FIXME: add an in-memory cache of hashes so that hashes in multiple releases hit the disk once
56 # FIXME: deal with rate-limited websites like alioth that do not like many requests
57
58 import re
59 import os
60 import sys
61 import httplib
62 import urllib2
63 import hashlib
64 import shutil
65 import logging
66 import tempfile
67 import string
68 import socket
69 import signal
70 import subprocess
71 import yaml
72 from debian import deb822, changelog
73 import apt_pkg
74 import psycopg2
75 try: import simplejson as json
76 except ImportError: import json
77 import struct
78
79 # Helper functions for python stuff with annoying error handling
80
def makedirs(dirs):
        """Best-effort recursive mkdir: errors (e.g. path already exists) are ignored."""
        try:
                os.makedirs(dirs)
        except OSError:
                pass
84
def rmtree(dir):
        """Best-effort recursive delete: a missing tree is not an error."""
        try:
                shutil.rmtree(dir)
        except OSError:
                pass
88
def remove(file):
        """Best-effort unlink: a missing file is not an error."""
        try:
                os.remove(file)
        except OSError:
                pass
92
def symlink(source, link):
        """Best-effort os.symlink: failures (e.g. link already exists) are ignored."""
        try:
                os.symlink(source, link)
        except OSError:
                pass
96
# http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
        """preexec_fn for Popen: restore the default SIGPIPE disposition.

        Python installs its own SIGPIPE handler by default, which is usually
        not what non-Python child processes expect.
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
102
# apt_pkg.version_compare returns an int of arbitrary magnitude; callers
# want cmp()-style -1/0/1 values.
# The documentation is incorrect: https://bugs.debian.org/680891
def apt_version_cmp(a, b):
        """Compare two Debian version strings, returning -1, 0 or 1."""
        ret = apt_pkg.version_compare(a, b)
        return (ret > 0) - (ret < 0)
110
# Config
# Per-hash-type "farm" directories shared across derivatives; paths are
# relative to the per-derivative working directory, hence the '..'.
md5_cache_dir = os.path.abspath('../md5-farm')
sha1_cache_dir = os.path.abspath('../sha1-farm')
sha256_cache_dir = os.path.abspath('../sha256-farm')
# Caches of generated patches, lsdiff output and changelogs, keyed by sha1.
sha1_patch_dir = os.path.abspath('../sha1-patches')
sha1_lsdiff_dir = os.path.abspath('../sha1-lsdiff')
sha1_changelog_dir = os.path.abspath('../sha1-changelog')
# Patch output trees: per-derivative and global.
deriv_patch_dir = os.path.abspath('patches')
global_patch_dir = os.path.abspath('../patches')
# SHA-1 based farm maintained by snapshot.debian.org.
snapshot_cache_dir = '/srv/snapshot.debian.org/farm'
# Placeholder document referenced when a generated patch is too large to keep.
patch_too_large = os.path.abspath('../../doc/patch-too-large.txt')
# The next three tuples are parallel arrays and are zip()ed together below:
# dsc entry key, hashlib constructor name, dsc header name.
checksum_types = ('sha1', 'sha256', 'md5sum')
checksum_hashlib = ('sha1', 'sha256', 'md5')
checksum_headers = ('Checksums-Sha1', 'Checksums-Sha256', 'Files')
user_agent = 'Debian Derivatives Census QA bot'
timeout = 60  # seconds, passed to urllib2.urlopen
# True when the string contains only hex digits (the empty string counts).
ishex = lambda s: not(set(s)-set(string.hexdigits))
128
# Init
apt_pkg.init()

# Preparation
sources_list = apt_pkg.SourceList()
sources_list.read_main_list()
# Snapshot metadata database; credentials come from the pg service file.
conn = psycopg2.connect("service=snapshot-guest")
cur = conn.cursor()
# Start the log file afresh on every run.
# NOTE(review): the usage comment above lists six arguments but the log
# file is read from sys.argv[7] — confirm which is correct.
remove(sys.argv[7])
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG, filename=sys.argv[7])

# Voodoo
lists_dir = apt_pkg.config.find_dir('Dir::State::lists')
# For each sources.list entry, keep only the Debian-format source indices.
source_entries = [[i for i in x.index_files if i.label=='Debian Source Index'] for x in sources_list.list]
# Scripts run from a per-derivative directory named after the derivative.
derivative_short_name = os.path.basename(os.getcwd())
# Statistics counters; presumably updated/reported later in the file — confirm.
modifies_dsc_files = 0
repackaged_but_identical = 0
146
147 # Generic helper functions
148
def uncompressed_size(filename):
        """Best-effort estimate of a compressed file's uncompressed size.

        Sniffs the format from magic bytes and returns
        max(compressed size, estimated uncompressed size), so the result is
        usable as a lower bound on required scratch space. Unrecognised
        formats fall through and yield the on-disk size.
        """
        uc_size = 0
        file_size = os.path.getsize(filename)
        with open(filename, 'rb') as f:
                magic = f.read(6)
                # *.gz: the 4-byte ISIZE trailer is the uncompressed size mod 2^32
                if magic[:2] == "\x1f\x8b":
                        f.seek(-4, 2)
                        data = f.read()
                        uc_size = struct.unpack('<I', data)[0]
                # *.bz2: the format stores no size field
                elif magic[:3] == 'BZh':
                        # Crude estimate based on average compression ratio of 25%
                        uc_size = file_size*4
                # *.xz: ask the xz tool, which can read the index
                elif magic == "\xfd7zXZ\x00":
                        cmdline = ['xz', '--verbose', '--list', filename]
                        process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                        output = process.communicate()[0]
                        if process.returncode:
                                logging.warning('xz reported failure to check size of %s:', filename)
                                logging.warning(output)
                        else:
                                for line in output.splitlines():
                                        line = line.strip()
                                        if line.startswith('Uncompressed size:'):
                                                # Parse e.g. "Uncompressed size:  1,234 B ..."
                                                match = re.match(r'Uncompressed size:  .*?([0-9,]+) B', line)
                                                if match: uc_size = int(''.join(match.group(1).split(',')))
                                                else: logging.warning('xz reported weird output for %s: %s', filename, line)
                # *.lz: the trailer's data-size field sits 16 bytes from the end
                elif magic[:4] == 'LZIP':
                        f.seek(-16, 2)
                        data = f.read(8)
                        uc_size = struct.unpack('<Q', data)[0]
        return max(file_size, uc_size)
184
def tmp_size():
        """Total capacity in bytes of the filesystem backing the temp dir."""
        st = os.statvfs(tempfile.gettempdir())
        return st.f_blocks * st.f_frsize
188
def tmp_space():
        """Bytes available to unprivileged users on the temp filesystem."""
        st = os.statvfs(tempfile.gettempdir())
        return st.f_bavail * st.f_frsize
192
def tmp_environ(tmp_dir):
        """Copy of os.environ with every common temp-dir variable set to tmp_dir."""
        env = dict(os.environ)
        for var in ('TMP', 'TMPDIR', 'TEMP', 'TEMPDIR'):
                env[var] = tmp_dir
        return env
196
197 # Helper functions for generating path names
198
def hash_path_parent(dir, hash):
        """Two-level fan-out directory (dir/ab/cd) for a hex digest."""
        return os.path.join(dir, hash[:2], hash[2:4])
201
def hash_path(dir, hash):
        """Full farm path (dir/ab/cd/abcd...) for a hex digest."""
        return os.path.join(dir, hash[:2], hash[2:4], hash)
204
def hash_path_exists(dir, hash):
        """Whether a digest is present in the farm rooted at dir."""
        return os.path.exists(os.path.join(dir, hash[:2], hash[2:4], hash))
207
def snapshot_hash_path(hash):
        """Path of a sha1 inside the snapshot.debian.org farm."""
        return hash_path(snapshot_cache_dir, hash)
210
def snapshot_hash_path_exists(hash):
        """Whether a sha1 is present in the snapshot.debian.org farm."""
        return hash_path_exists(snapshot_cache_dir, hash)
213
def part_hash_path(part):
        """Filesystem path for a source package part dict.

        Prefers sha1 (snapshot farm first, then the derivatives farm),
        then sha256, then md5sum. Returns None when the part carries no
        recognised checksum key.
        """
        if 'sha1' in part:
                snapshot = snapshot_hash_path(part['sha1'])
                if os.path.exists(snapshot):
                        return snapshot
                return hash_path(sha1_cache_dir, part['sha1'])
        if 'sha256' in part:
                return hash_path(sha256_cache_dir, part['sha256'])
        if 'md5sum' in part:
                return hash_path(md5_cache_dir, part['md5sum'])
        return None
225
def sha1_patch_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path for a patch between two dsc files, keyed by the sha1 pair."""
        base = os.path.join(hash_path(sha1_patch_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        suffix = '.%s' % type if type else ''
        return os.path.abspath(base + suffix + '.patch')
231
def sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path for lsdiff output between two dsc files, keyed by the sha1 pair."""
        base = os.path.join(hash_path(sha1_lsdiff_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        suffix = '.%s' % type if type else ''
        return os.path.abspath(base + suffix + '.lsdiff')
237
def shortslug(name):
        """Pool-style prefix: 'lib' packages use four characters, others one."""
        if name.startswith('lib'):
                return name[:4]
        return name[0]
240
def deriv_patch_path(name, version, debian_name, debian_version, type=None):
        """Per-derivative output path for a patch against the Debian package."""
        parent = os.path.join(deriv_patch_dir, shortslug(debian_name), debian_name, '')
        filename = '_'.join((debian_name, debian_version, name, version))
        if type:
                filename += '.%s' % type
        return os.path.abspath(parent + filename + '.patch')
247
def global_patch_path(name, version, debian_name, debian_version, type=None):
        """Global (cross-derivative) output path for a patch against Debian."""
        parent = os.path.join(global_patch_dir, shortslug(debian_name), debian_name, '')
        filename = '_'.join(('Debian', debian_name, debian_version, derivative_short_name, name, version))
        if type:
                filename += '.%s' % type
        return os.path.abspath(parent + filename + '.patch')
254
255 # Functions for munging source packages
256
def convert_lzip_to_gzip(dir, name):
        """Recompress dir/name from lzip to gzip in place.

        Returns (old_name, new_name) on success or None when either tool
        reports failure; tool output is logged in that case.
        """
        cmdline = ['lzip', '-d', '--', name]
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('lzip reported failure to decompress %s:', name)
                logging.warning(output)
                return None
        bname = name[0:-3] # Strip off .lz
        cmdline = ['gzip', '-1', '--', bname] # gzip -1 to reduce overhead
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('gzip reported failure to compress %s:', bname)
                logging.warning(output)
                return None
        return (name, bname+'.gz')
274
def update_dsc_file(dir, dsc_name, parts):
        """Rewrite a dsc file after some parts were recompressed.

        parts is a list of (old_name, new_name) pairs; in every checksum
        header present in the dsc, the entry for old_name is replaced with
        the new file's name, size and checksum.
        """
        dsc_path = os.path.join(dir,dsc_name)
        dsc_file = open(dsc_path,'rb')
        dsc = deb822.Dsc(dsc_file)
        for (old, name) in parts:
                path = os.path.join(dir,name)
                size = os.path.getsize(path)
                with open(path,'rb') as f:
                        # Hash the new file once with every checksum algorithm.
                        hashes = {}
                        for (type, func) in zip(checksum_types, checksum_hashlib):
                                hashes[type] = getattr(hashlib, func)()
                        for chunk in iter(lambda: f.read(128*64L), b''):
                                for type in checksum_types:
                                        hashes[type].update(chunk)
                        for type in checksum_types:
                                hashes[type] = hashes[type].hexdigest()
                        # Swap the old entry in each checksum header for the new file's data.
                        for (header, type) in zip(checksum_headers, checksum_types):
                                if header in dsc:
                                        dsc[header] = [{type: hashes[type], 'size': size, 'name': name} if p['name'] == old else p for p in dsc[header]]
        dsc_file.close()
        os.remove(dsc_path) # So we don't change the original that the dsc links to
        with open(dsc_path,'wb') as dsc_file:
                dsc.dump(dsc_file)
298
299 # Functions for downloading files and storing them in the hash caches
300
def download_and_check_hash(url, dir, hash, hash_type):
        """Download url, verify it against an md5/sha256 hash, file it by sha1.

        The md5/sha256 farm entry under dir becomes a symlink pointing at a
        sha1 farm entry: snapshot.debian.org's farm when the content is
        already known there, otherwise the derivatives sha1 farm (saving the
        data there first when needed).

        Returns (True, sha1) when the content exists in snapshot's farm,
        (False, sha1) when it only exists in the derivatives farm, and
        ('unknown', None) on any download or verification failure.
        """
        try:
                parent = hash_path_parent(dir,hash)
                path = hash_path(dir,hash)
                logging.debug('downloading %s', url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                data = u.read()
                if hash_type == 'sha256':
                        data_hash = hashlib.sha256(data).hexdigest()
                elif hash_type == 'md5sum':
                        data_hash = hashlib.md5(data).hexdigest()
                else:
                        logging.warning('unknown hash type detected: %s %s %s', hash_type, hash, url)
                        return ('unknown', None)
                if data_hash != hash:
                        logging.warning('incorrect hash for downloaded file, ignoring: %s %s != %s %s', hash_type, hash, data_hash, url)
                        return ('unknown', None)
                sha1 = hashlib.sha1(data).hexdigest()
                sha1_path = hash_path(sha1_cache_dir, sha1)
                sha1_parent = hash_path_parent(sha1_cache_dir, sha1)
                makedirs(sha1_parent)
                snapshot_path = snapshot_hash_path(sha1)
                if os.path.exists(snapshot_path):
                        # Already in snapshot's farm: just point the hash cache at it.
                        symlink(snapshot_path, path)
                        logging.debug('exists in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (True, sha1)
                else:
                        if not os.path.exists(sha1_path):
                                logging.debug('correct hash for downloaded file, saving: %s %s %s %s', hash_type, hash, sha1, url)
                                # NOTE(review): text-mode write; harmless on POSIX
                                # Python 2 but 'wb' would be more explicit — confirm.
                                with open(sha1_path, 'w') as f:
                                        f.write(data)
                        else:
                                logging.debug('correct hash for downloaded file, not saving: already in derivs cache: %s %s %s %s', hash_type, hash, sha1, url)
                        # Symlink target is made relative to the cache entry's directory.
                        symlink(os.path.relpath(sha1_path, os.path.dirname(path)), path)
                        logging.debug('does not exist in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (False, sha1)
        except urllib2.URLError, e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download hash file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException, e:
                logging.warning('unable to download hash file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error, e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)
352
353 def download_sha1(url, dir, sha1):
354         try:
355                 parent = hash_path_parent(dir,sha1)
356                 path = hash_path(dir,sha1)
357                 logging.debug('downloading sha1: %s %s', sha1, url)
358                 makedirs(parent)
359                 headers = { 'User-Agent' : user_agent }
360                 req = urllib2.Request(url, None, headers)
361                 u = urllib2.urlopen(req, None, timeout)
362                 data = u.read()
363                 data_sha1 = hashlib.sha1(data).hexdigest()
364                 if data_sha1 == sha1:
365                         logging.debug('correct sha1 for downloaded file, saving: %s %s', sha1, url)
366                         if not os.path.exists(path):
367                                 with open(path, 'w') as f:
368                                         f.write(data)
369                         return (False, sha1)
370                 else:
371                         logging.warning('incorrect sha1 for downloaded file, ignoring: %s != %s %s', sha1, data_sha1, url)
372                         return ('unknown', None)
373         except urllib2.URLError, e:
374                 if hasattr(e, 'reason'): reason = e.reason
375                 elif hasattr(e, 'code'): reason = e.code
376                 else: reason = e
377                 logging.warning('unable to download sha1 file, ignoring: %s %s', reason, url)
378                 return ('unknown', None)
379         except httplib.HTTPException, e:
380                 logging.warning('unable to download hash file, ignoring: %s %s', repr(e), url)
381                 return ('unknown', None)
382         except socket.error, e:
383                 logging.warning('unable to download hash file, ignoring: %s %s', e, url)
384                 return ('unknown', None)
385
386 # Functions for checking the hash caches
387
def check_hash_cache(dir, hash, hash_type, url):
        """Resolve an md5/sha256 hash to a sha1 via the on-disk caches.

        Entries in the md5/sha256 farms are symlinks whose target basename is
        the sha1; a missing or unreadable link triggers a fresh download.
        Returns the same (status, sha1) tuples as download_and_check_hash().
        """
        logging.debug('checking hash cache: %s %s', hash_type, hash)
        path = hash_path(dir, hash)
        try:
                # Resolve the symlink target relative to the entry's directory.
                result = os.readlink(path)
                path = os.path.join(os.path.dirname(path), result)
        except OSError:
                logging.debug('does not exist in hash cache: %s %s', hash_type, hash)
                return download_and_check_hash(url, dir, hash, hash_type)
        logging.debug('exists in hash cache: %s %s', hash_type, hash)
        sha1 = os.path.basename(path)
        if snapshot_hash_path_exists(sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                # Snapshot now has the content, so the derivatives copy is redundant.
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                logging.debug('missing in derivatives sha1 cache: %s', sha1)
                return download_and_check_hash(url, dir, hash, hash_type)
409
def check_sha1_cache(sha1, url):
        """Locate a sha1 in the farms, downloading it when absent.

        Returns (True, sha1) when snapshot's farm has it, (False, sha1) when
        only the derivatives farm has it, or download_sha1()'s result.
        """
        logging.debug('checking sha1 caches: %s', sha1)
        if snapshot_hash_path_exists(sha1):
                # Snapshot has it; drop any duplicate from the derivatives farm.
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        if hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        logging.debug('does not exist in any sha1 caches: %s', sha1)
        return download_sha1(url, sha1_cache_dir, sha1)
422
def status(type, hash, url):
        """Map a hash of any supported type to a modification status.

        Returns ('unmodified', sha1) when the content is in snapshot's farm,
        ('modified', sha1) when it is only in the derivatives farm, and
        ('unknown', None) on failure or unrecognised hash type.
        """
        logging.debug('checking status of hash: %s %s %s', type, hash, url)
        # Dispatch to the right cache checker; the result mapping below was
        # previously triplicated across the branches.
        if type == 'sha1':
                (ret, sha1) = check_sha1_cache(hash, url)
        elif type == 'sha256':
                (ret, sha1) = check_hash_cache(sha256_cache_dir, hash, type, url)
        elif type == 'md5sum':
                (ret, sha1) = check_hash_cache(md5_cache_dir, hash, type, url)
        else:
                logging.warning('unknown hash type detected: %s %s %s', type, hash, url)
                return ('unknown', None)
        if ret is True:
                return ('unmodified', sha1)
        elif ret is False:
                return ('modified', sha1)
        else:
                return (ret, sha1)
452
453 # Functions for getting information about source packages
454
def get_info(srcpkg):
        """Extract the dsc checksum and the non-dsc parts from a Sources stanza.

        Returns (dsc_hash_type, dsc_hash, dsc_name, parts) where parts is a
        list of per-file dicts, or None when no unique dsc file is found.
        """
        dsc = None
        for header in checksum_headers:
                if not dsc and header in srcpkg:
                        dsc = [x for x in srcpkg[header] if x['name'].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any dsc files')
                return None
        if len(dsc) > 1:
                # The entries are dicts, not (name, sha1) pairs: the previous
                # tuple-unpacking here raised ValueError instead of logging.
                logging.warning('found multiple dsc files: %s' % ' '.join([d['name'] for d in dsc]))
                return None
        dsc = dsc[0]
        dsc_name = dsc['name']
        # The one key left after dropping name/size is the checksum field.
        dsc_hash_type, dsc_hash =  [(k, v) for k, v in dsc.iteritems() if k not in ('name', 'size')][0]

        # Collect each part once, preferring the first header that lists it.
        parts = []
        part_names = []
        for header in checksum_headers:
                if header in srcpkg:
                        for part in srcpkg[header]:
                                if 'name' in part and part['name'] not in part_names and not part['name'].endswith('.dsc'):
                                        parts.append(part)
                                        part_names.append(part['name'])

        return (dsc_hash_type, dsc_hash, dsc_name, parts)
480
def get_debian_info(files):
        """Split snapshot file rows into the Debian dsc and its unique parts.

        files is a sequence of (name, sha1) pairs. Returns
        (dsc_sha1, dsc_name, parts) or None when no dsc is present; when
        several dsc files exist the first is chosen with a warning.
        """
        dscs = [entry for entry in files if entry[0].endswith('.dsc')]
        if not dscs:
                logging.warning('did not find any Debian dsc files: snapshots bug or ancient source package')
                return None
        if len(dscs) > 1:
                logging.warning('found multiple Debian dsc files, choosing first one: %s' % ' '.join(['%s %s' % (dsc_name, dsc_sha1) for dsc_name, dsc_sha1 in dscs]))
        dsc_name, dsc_sha1 = dscs[0]

        # Keep each non-dsc part once, preserving first-seen order.
        parts = []
        seen_names = []
        for entry in files:
                part_name, part_sha1 = entry
                if not part_name.endswith('.dsc') and part_name not in seen_names:
                        seen_names.append(part_name)
                        parts.append(entry)

        return (dsc_sha1, dsc_name, parts)
501
502 # Functions for extracting information from the snapshots database
503
def database_error(e):
        """Log a psycopg2 error and reset the connection so later queries can run."""
        code = getattr(e, 'pgcode', None)
        reason = getattr(e, 'pgerror', None)
        logging.warning('unable to execute database query: %s %s', code, reason)
        conn.reset()
511
512 def srcpkg_was_in_debian(name, version=None):
513         try:
514                 if version:
515                         cur.execute('SELECT version FROM srcpkg WHERE name=%s AND version=%s LIMIT 1;', (name, version))
516                         return not not cur.fetchone()
517                 else:
518                         cur.execute('SELECT version FROM srcpkg WHERE name=%s LIMIT 1;', (name,))
519                         return not not cur.fetchone()
520         except psycopg2.Error, e:
521                 database_error(e)
522                 return None
523
524 def sha1_to_srcpkgs(sha1):
525         try:
526                 cur.execute(
527                         '''SELECT name, version
528                         FROM srcpkg
529                         JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
530                         WHERE hash=%s;''', (sha1,))
531                 return cur.fetchall()
532         except psycopg2.Error, e:
533                 database_error(e)
534                 return None
535
536 def srcpkg_to_sha1s(name, version):
537         try:
538                 cur.execute(
539                         '''SELECT hash
540                         FROM file_srcpkg_mapping
541                         JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
542                         WHERE name=%s AND version=%s;''', (name, version))
543                 return cur.fetchall()
544         except psycopg2.Error, e:
545                 database_error(e)
546                 return None
547
548 def srcpkg_to_srcpkgs(name):
549         try:
550                 cur.execute(
551                         '''SELECT name, version
552                         FROM srcpkg
553                         WHERE name=%s ORDER BY version DESC;''', (name,))
554                 return cur.fetchall()
555         except psycopg2.Error, e:
556                 database_error(e)
557                 return None
558
559 def sha1s_to_files(sha1):
560         try:
561                 cur.execute('SELECT DISTINCT ON (name, hash) name, hash FROM file WHERE hash=%s;', hash)
562                 return cur.fetchall()
563         except psycopg2.Error, e:
564                 database_error(e)
565                 return None
566
567 def srcpkg_to_files(name, version):
568         try:
569                 cur.execute(
570                         '''SELECT DISTINCT ON (file.name, file.hash) file.name, file.hash
571                         FROM file_srcpkg_mapping
572                         JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
573                         JOIN file ON file_srcpkg_mapping.hash=file.hash
574                         WHERE srcpkg.name=%s AND srcpkg.version=%s;''', (name, version))
575                 return cur.fetchall()
576         except psycopg2.Error, e:
577                 database_error(e)
578                 return None
579
580 def sha1_version_to_derived_from(sha1, version):
581         try:
582                 cur.execute(
583                         '''SELECT name, version
584                         FROM srcpkg
585                         JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
586                         WHERE hash=%s and version<=%s
587                         ORDER BY name ASC, version DESC
588                         LIMIT 1;''', (sha1, version))
589                 res = cur.fetchall()
590                 if res: return res
591                 cur.execute(
592                         '''SELECT name, version
593                         FROM srcpkg
594                         JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
595                         WHERE hash=%s
596                         ORDER BY name ASC, version ASC
597                         LIMIT 1;''', (sha1, version))
598                 return cur.fetchall()
599         except psycopg2.Error, e:
600                 database_error(e)
601                 return None
602
603 def srcpkg_to_derived_from(name, version):
604         try:
605                 cur.execute(
606                         '''SELECT name, version
607                         FROM srcpkg
608                         WHERE name=%s and version<=%s
609                         ORDER BY version DESC
610                         LIMIT 1;''', (name, version))
611                 res = cur.fetchall()
612                 if res: return res
613                 cur.execute(
614                         '''SELECT name, version
615                         FROM srcpkg
616                         WHERE name=%s
617                         ORDER BY version ASC
618                         LIMIT 1;''', (name,))
619                 return cur.fetchall()
620         except psycopg2.Error, e:
621                 database_error(e)
622                 return None
623
624 # Functions related to creating patches
625
626 # Add symlinks for all needed files
def prepare(dsc_name, dsc_sha1, parts):
        """Set up a temp dir of symlinks to all parts of a derivative source package.

        Parts compressed with lzip are recompressed with gzip (a workaround
        for compression schemes some distributions allow) and the dsc is
        rewritten to match. Returns (tmp_dir, total_size) on success or
        (None, total_size) when any part is unreadable or conversion fails.
        """
        logging.debug('preparing deriv directory for %s', dsc_name)
        total_size = 0
        unreadable_parts = []
        symlink_parts = []
        convert_lzip_parts = []
        converted_parts = []
        # The dsc may live in snapshot's farm or in the derivatives farm.
        path = snapshot_hash_path(dsc_sha1)
        if not os.path.exists(path): path = hash_path(sha1_cache_dir, dsc_sha1)
        if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
        symlink_parts.append((path, dsc_name))
        for part in parts:
                path = part_hash_path(part)
                if not path: continue
                if not os.access(path, os.R_OK):
                        unreadable_parts.append(part['name'])
                        continue
                size = uncompressed_size(path)
                total_size += size
                symlink_parts.append((path, part['name']))
                if part['name'].endswith('.lz'):
                        # Recompression needs room for the decompressed copy too.
                        total_size += size
                        convert_lzip_parts.append(part['name'])

        if unreadable_parts:
                logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
                return (None, total_size)

        # Ensure that the debdiff will have enough space
        tmp_parent = None
        if total_size*2.0 > tmp_size()/2.0 or total_size > tmp_space():
                logging.info('prepare: not enough space in /tmp, using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())
                tmp_parent = os.path.expanduser('~/tmp')
        elif dsc_name.startswith('libreoffice_') or dsc_name.startswith('iceweasel_'):
                # These packages always go to scratch space regardless of the estimate.
                logging.info('prepare: iceweasel/libreoffice, always using scratch space')
                tmp_parent = os.path.expanduser('~/tmp')
        else:
                logging.debug('prepare: enough space in /tmp, not using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())

        tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-%s-' % derivative_short_name, dir=tmp_parent)

        # Setup the symlinks for dpkg-source/debdiff/etc
        for symlink_target, symlink_name in symlink_parts:
                symlink_path = os.path.join(tmp_dir, symlink_name)
                os.symlink(symlink_target, symlink_path)

        # Some distributions allow additional compression schemes
        # Here we work around this by recompressing with gzip
        for part_name in convert_lzip_parts:
                converted = convert_lzip_to_gzip(tmp_dir, part_name)
                if converted is not None:
                        converted_parts.append(converted)
                else:
                        # Conversion failed: clean up and report failure.
                        rmtree(tmp_dir)
                        return (None, total_size)

        # Update the dsc file if we recompressed any files
        if converted_parts:
                update_dsc_file(tmp_dir, dsc_name, converted_parts)

        return (tmp_dir, total_size)
688
def prepare_debian(dsc_name, dsc_sha1, files, total_size):
	"""Create a temporary directory populated with symlinks to the parts
	of a Debian source package from the snapshot SHA-1 filesystem, ready
	for debdiff against the derivative package.

	dsc_name: filename of the Debian dsc file
	dsc_sha1: SHA-1 of the dsc file (key into the snapshot filesystem)
	files: iterable of (part_name, part_sha1) tuples for the non-dsc parts
	total_size: uncompressed size already accumulated for the derivative
	            side; the Debian parts are added to decide if /tmp is big
	            enough for the debdiff

	Returns the temporary directory path, or None if any part is unreadable.
	"""
	logging.debug('preparing Debian directory for %s', dsc_name)
	unreadable_parts = []
	symlink_parts = []
	path = snapshot_hash_path(dsc_sha1)
	if not os.access(path, os.R_OK): unreadable_parts.append(dsc_name)
	symlink_parts.append((path, dsc_name))
	# Unpack (part_name, part_sha1) directly; the old loop variable was
	# named `file`, shadowing the builtin
	for part_name, part_sha1 in files:
		path = snapshot_hash_path(part_sha1)
		if not os.access(path, os.R_OK):
			unreadable_parts.append(part_name)
			continue
		size = uncompressed_size(path)
		total_size += size
		symlink_parts.append((path, part_name))

	if unreadable_parts:
		logging.warning('some parts of %s are unreadable: %s', dsc_name, ' '.join(unreadable_parts))
		return None

	# Ensure that the debdiff will have enough space
	tmp_parent = None
	if total_size > tmp_space():
		logging.info('prepare_debian: not enough space in /tmp, using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())
		tmp_parent = os.path.expanduser('~/tmp')
	elif dsc_name.startswith('libreoffice_') or dsc_name.startswith('iceweasel_'):
		# Fixed log prefix: this message previously claimed to come
		# from prepare() (copy-paste from that function)
		logging.info('prepare_debian: iceweasel/libreoffice, always using scratch space')
		tmp_parent = os.path.expanduser('~/tmp')
	else:
		logging.debug('prepare_debian: enough space in /tmp, not using scratch space: %s %s %s', total_size, tmp_size(), tmp_space())

	debian_tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-Debian-', dir=tmp_parent)

	# Setup the symlinks for dpkg-source/debdiff/etc
	for symlink_target, symlink_name in symlink_parts:
		symlink_path = os.path.join(debian_tmp_dir, symlink_name)
		os.symlink(symlink_target, symlink_path)

	return debian_tmp_dir
729
def get_changelog_entries(tmp_dir, dsc_name, dsc_sha1):
	"""Return the changelog entries of a source package as a list of
	(source name, version string) tuples, newest entry first.

	Entries are cached on disk keyed by the SHA-1 of the dsc file so
	each package only needs to be unpacked once. Returns None when the
	package cannot be extracted, or the changelog is missing, a symlink
	(unsafe to read) or unparseable.
	"""
	logging.debug('getting changelog entries from %s', dsc_name)

	# Cache check
	changelog_path = hash_path(sha1_changelog_dir, dsc_sha1)
	if os.path.exists(changelog_path):
		logging.debug('changelog cache exists for %s %s', dsc_name, dsc_sha1)
		with open(changelog_path) as f:
			# A corrupt cache file falls through to re-extraction
			try: changelog_entries = json.load(f)
			except ValueError: pass
			else: return [tuple(entry) for entry in changelog_entries]

	# Preparation
	extract_path = os.path.join(tmp_dir,'extracted')

	# Unpack the source tree
	logging.debug('unpacking source package %s', dsc_name)
	cmdline = ['dpkg-source', '-x', dsc_name, 'extracted']
	process = subprocess.Popen(cmdline, cwd=tmp_dir, env=tmp_environ(tmp_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
	output = process.communicate()[0]
	if process.returncode:
		logging.warning('dpkg-source reported failure to extract %s:', dsc_name)
		logging.warning(output)
		# Log a recursive directory listing to help debug the failure
		cmdline = ['ls', '-lR', '--time-style=+']
		process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		output = process.communicate()[0]
		logging.warning(output)
		rmtree(extract_path)
		return None

	# Sanitise the debian dir and changelog file in case it is a symlink to outside
	debian_dir = os.path.join(extract_path, 'debian')
	changelog_filename = os.path.join(debian_dir,'changelog')
	if os.path.islink(debian_dir) or os.path.islink(changelog_filename):
		logging.warning('debian dir or changelog is a symbolic link %s', dsc_name)
		rmtree(extract_path)
		return None

	# Check if the changelog exists
	if not os.path.exists(changelog_filename):
		logging.warning('could not find changelog in %s', dsc_name)
		rmtree(extract_path)
		return None

	# Find out which source package is the most likely derivative
	logging.debug('parsing changelog for %s', dsc_name)
	changelog_file = open(changelog_filename)
	try:
		try:
			changelog_obj = changelog.Changelog(changelog_file)
		except UnicodeDecodeError:
			# Changelog is not valid UTF-8, retry assuming latin-1
			changelog_file.seek(0)
			changelog_obj = changelog.Changelog(changelog_file, encoding='iso-8859-1')
		try:
			changelog_entries = [(entry.package, str(entry._raw_version)) for entry in changelog_obj]
		except Exception:
			# Narrowed from a bare except: so KeyboardInterrupt and
			# SystemExit are no longer swallowed here
			logging.warning('could not read changelog from %s', dsc_name)
			rmtree(extract_path)
			return None
		del changelog_obj
	finally:
		# Always close the changelog, including on the error paths
		# (previously leaked when parsing failed)
		changelog_file.close()

	# Clean up again
	rmtree(extract_path)

	# Write the cache
	makedirs(hash_path_parent(sha1_changelog_dir, dsc_sha1))
	remove(changelog_path)
	with open(changelog_path, 'w') as f:
		json.dump(changelog_entries, f)

	return changelog_entries
801
# Find the source package name and version this is probably derived from
def find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified):
	"""Guess the Debian source package (name, version) that the
	derivative package name/version is based on.

	Heuristics, tried in order:
	1. a changelog entry that also appears among candidates derived
	   from the unmodified parts (or from old versions of the package)
	2. a changelog entry that was ever in Debian
	3. the highest candidate with an equal-or-lower version, else the
	   next highest candidate version
	4. the closest version number known to the snapshot database

	Returns a (name, version) tuple or None when nothing plausible is found.
	"""
	logging.debug('finding base source package of %s %s', name, version)

	# Get a list of changelog entries
	changelog_entries = get_changelog_entries(tmp_dir, dsc_name, dsc_sha1)
	if changelog_entries:
		logging.debug('changelog entries are: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in changelog_entries]))

	# Get a list of candidate versions from the database
	possibly_derived_from = []
	logging.debug('checking which parts were in Debian')
	for part_sha1, part_name in parts_unmodified:
		# Source packages that ever contained this exact file
		part_derived_from = sha1_to_srcpkgs(part_sha1)
		if part_derived_from:
			logging.debug('part %s %s available in %s', part_sha1, part_name, ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in part_derived_from]))
			possibly_derived_from.extend(part_derived_from)

	if not possibly_derived_from:
		logging.debug('no parts in common with Debian, obtaining old versions')
		old_packages = srcpkg_to_srcpkgs(name)
		if old_packages: possibly_derived_from = old_packages

	# Uniqify
	possibly_derived_from = list(set(possibly_derived_from))
	if possibly_derived_from:
		logging.debug('possibly derived from: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in possibly_derived_from]))
	else:
		logging.debug('nothing in possibly derived from list')

	# Match changelog versions against candidates
	if changelog_entries:
		logging.debug('matching changelog entries against versions possibly derived from')
		for entry in changelog_entries:
			entry_name, entry_version = entry
			if entry in possibly_derived_from:
				logging.debug('%s %s in possibly derived from', entry_name, entry_version)
				return entry
		logging.debug('checking if changelog entries were ever in Debian')
		for entry_name, entry_version in changelog_entries:
			if srcpkg_was_in_debian(entry_name, entry_version):
				logging.debug('%s %s was in Debian', entry_name, entry_version)
				return (entry_name, entry_version)
	if possibly_derived_from:
		logging.debug('finding closest entry in possibly derived from')
		# Sort candidates from highest to lowest version (Python 2 cmp sort)
		possibly_derived_from.sort(cmp=lambda a,b: apt_version_cmp(b[1],a[1]))
		for entry_name, entry_version in possibly_derived_from:
			if name == entry_name and apt_version_cmp(version, entry_version) >= 0:
				logging.debug('%s %s is an equal or lower version', entry_name, entry_version)
				return (entry_name, entry_version)
		# All candidates compared higher than the derivative; after the
		# descending sort the last element is the next highest version
		entry = possibly_derived_from[-1]
		entry_name, entry_version = entry
		logging.debug('no lower version numbers, returning next highest version %s %s', entry_name, entry_version)
		return entry
	logging.debug('finding closest version number in Debian')
	# Only the first (closest) match is wanted, hence the immediate return
	for entry in srcpkg_to_derived_from(name, version):
		entry_name, entry_version = entry
		logging.debug('closest package was %s %s', entry_name, entry_version)
		return entry
	logging.debug('could not find Debian package %s %s is derived from', name, version)
	return None
863
# Log recursive directory listings of both package trees, to help debug
# debdiff failures
def _log_dir_listings(tmp_dir, debian_tmp_dir):
	cmdline = ['ls', '-lR', '--time-style=+']
	for listing_name, listing_dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
		logging.warning('dir listing for %s:', listing_name)
		process = subprocess.Popen(cmdline, cwd=listing_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		output = process.communicate()[0]
		logging.warning(output)

# Generate a patch file
def create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1):
	"""Generate (and cache) the debdiff between the Debian and derivative
	dsc files, plus a filtered variant covering only the debian/ dir.

	Returns False when debdiff failed or the packages turned out to be
	identical apart from repackaging, True when the patch files exist
	(possibly from a previous run).
	"""
	global repackaged_but_identical

	dsc_path = os.path.join(tmp_dir, dsc_name)
	debian_dsc_path = os.path.join(debian_tmp_dir, debian_dsc_name)
	path_everything = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	path_debian = sha1_patch_path(debian_dsc_sha1, dsc_sha1, 'debian')

	# Generate the main patch
	if not os.path.exists(path_everything) and os.path.exists(debian_dsc_path) and os.path.exists(dsc_path):
		makedirs(os.path.dirname(path_everything))
		cmdline = ['debdiff', '--quiet', '--diffstat', debian_dsc_path, dsc_path]
		# with-block ensures the output file is closed even if Popen raises
		with open(path_everything, 'w') as stdout:
			process = subprocess.Popen(cmdline, env=tmp_environ(tmp_dir), stdout=stdout, stderr=subprocess.PIPE, preexec_fn=subprocess_setup)
			output = process.communicate()[1]
		# Per the checks below: 255 means debdiff failed, 0 means the
		# packages are identical, 1 means a diff was produced
		if process.returncode == 255:
			logging.warning('debdiff reported failure %s %s:', debian_dsc_name, dsc_name)
			logging.warning(output)
			_log_dir_listings(tmp_dir, debian_tmp_dir)
			return False
		elif process.returncode == 0:
			logging.info('derivative repackaged in an identical way %s %s', debian_dsc_name, dsc_name)
			repackaged_but_identical += 1
			return False
		elif process.returncode != 1:
			logging.warning('debdiff reported unknown return code %s %s %s:', process.returncode, debian_dsc_name, dsc_name)
			logging.warning(output)
			_log_dir_listings(tmp_dir, debian_tmp_dir)
			return False

	# Filter the main patch to include only the debian/ directory
	if os.path.exists(path_everything) and not os.path.exists(path_debian):
		makedirs(os.path.dirname(path_debian))
		cmdline = ['filterdiff', '--include=*/debian/*', path_everything]
		filterdiff = subprocess.Popen(cmdline, env=tmp_environ(tmp_dir), stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		filterdiff_output = filterdiff.communicate()[0]
		diffstat = subprocess.Popen('diffstat', stdin=subprocess.PIPE, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		diffstat_output = diffstat.communicate(filterdiff_output)[0]
		with open(path_debian, 'w') as f:
			f.write('diffstat of debian/ for %s %s\n' % (os.path.splitext(debian_dsc_name)[0], os.path.splitext(dsc_name)[0]))
			f.write('\n')
			f.write(diffstat_output)
			f.write('\n')
			f.write(filterdiff_output)

	# Patches > 100MB are probably not that useful, replace them with a link
	for path in path_everything, path_debian:
		try:
			if os.path.getsize(path) > 104857600:
				logging.info('patch between %s and %s is larger than 100MB', dsc_name, debian_dsc_name)
				remove(path)
				symlink(os.path.relpath(patch_too_large, os.path.dirname(path)), path)
		except OSError:
			# Patch file may not exist; best-effort cleanup only
			pass

	return True
932
def check_patch(debian_dsc_sha1, dsc_sha1):
	"""Return True when the cached patch touches anything besides
	debian/changelog, False when it only changes the changelog or is
	empty. The lsdiff output is cached next to the patch."""
	patch_file = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	cache_file = sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1)
	if os.path.exists(cache_file):
		logging.debug('lsdiff cache exists for %s', patch_file)
		with open(cache_file) as cached:
			listing = cached.read()
	else:
		logging.debug('lsdiff cache does not exist for %s', patch_file)
		proc = subprocess.Popen(['lsdiff', patch_file], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		listing = proc.communicate()[0]
		makedirs(os.path.dirname(cache_file))
		with open(cache_file,'w') as cached:
			cached.write(listing)
	changed = listing.splitlines()
	for filename in changed:
		if filename == 'debian/changelog' or filename.endswith('/debian/changelog'):
			continue
		logging.debug('patch changes files other than debian/changelog')
		return True
	if changed:
		logging.debug('patch does not change files other than debian/changelog')
	else:
		logging.debug('patch does not change any files')
	return False
958
def present_patch(name, version, dsc_sha1, debian_name, debian_version,  debian_dsc_sha1):
	"""Publish the cached patches via the per-derivative and global
	symlink trees and return a tuple of patch paths relative to the
	global patch dir (empty when the patch is not useful)."""
	is_useful = check_patch(debian_dsc_sha1, dsc_sha1)
	published = []
	for variant in ('', 'debian'):
		target = sha1_patch_path(debian_dsc_sha1, dsc_sha1, variant)
		if not os.path.exists(target):
			continue
		deriv_link = deriv_patch_path(name, version, debian_name, debian_version, variant)
		global_link = global_patch_path(name, version, debian_name, debian_version, variant)
		# Replace any stale symlinks with fresh relative ones
		for link in (deriv_link, global_link):
			makedirs(os.path.dirname(link))
			remove(link)
			symlink(os.path.relpath(target, os.path.dirname(link)), link)
		if is_useful:
			published.append(os.path.relpath(global_link, os.path.abspath(global_patch_dir)))
	return tuple(published)
978
979 # Functions that wrap other functions and decide what to do
980
def check_source_package(source_entry, srcpkg):
	"""Check one derivative source package against Debian and drive the
	patch generation for it.

	source_entry: the apt sources.list entry the package came from
	srcpkg: a deb822 Sources paragraph describing the package

	Returns None when nothing is to be recorded (unmodified package,
	missing info or possibly malicious metadata), otherwise a tuple
	(files, patch, link, new) where each element may be None:
	files: list of (sha1, hash_type, hash) for the dsc and modified parts
	patch: tuple describing the generated patch, or None
	link: (debian_name, debian_version, name, version, dsc_url), or None
	new: (name, version, dsc_url) when no Debian base was found, or None
	"""
	global modifies_dsc_files

	try:
		# Initialised up front so the KeyError handler below can log
		# whichever fields were successfully read
		name = None
		version = None
		dir = None
		name = srcpkg['Package']
		version = srcpkg['Version']
		dir = srcpkg['Directory']
		# Reject path-traversal attempts in untrusted metadata
		if '/' in name or name == '..':
			logging.warning('could not process source package %s %s: possibly malicious name', name, version)
			return None
		if '/' in version or version == '..':
			logging.warning('could not process source package %s %s: possibly malicious version', name, version)
			return None
		if '..' in dir.split('/'):
			logging.warning('could not process source package %s %s: possibly malicious dir: %s', name, version, dir)
			return None
	except KeyError:
		logging.warning('could not process source package %s %s', name, version)
		return None
	logging.debug('started processing source package %s %s', name, version)
	info = get_info(srcpkg)
	if not info:
		logging.warning('finished processing source package %s %s: could not get any info', name, version)
		return None
	dsc_hash_type, dsc_hash, dsc_name, parts = info
	if '/' in dsc_name or dsc_name == '..':
		logging.warning('could not process source package %s %s: possibly malicious dsc name %s', name, version, dsc_name)
		return None
	if not ishex(dsc_hash):
		logging.warning('could not process source package %s %s: possibly malicious dsc hash %s', name, version, dsc_hash)
		return None
	dsc_url = source_entry.archive_uri('%s/%s' % (dir, dsc_name))
	logging.debug('found dsc file: %s %s %s', dsc_hash_type, dsc_hash, dsc_url)
	dsc_status, dsc_sha1 = status(dsc_hash_type, dsc_hash, dsc_url)
	logging.debug('checked dsc status: %s %s %s', dsc_status, dsc_sha1, dsc_url)
	if dsc_status == 'unmodified':
		# Ignore the srcpkg since we know it is was in Debian
		# at one point and is hopefully therefore unmodified
		logging.debug('finished processing source package %s %s: dsc unmodified', name, version)
		return None
	else:
		# Classify every non-dsc part as unmodified/modified/unknown
		files = [(dsc_sha1, dsc_hash_type, dsc_hash)]
		parts_unmodified = []
		parts_modified = []
		parts_unknown = []
		for part in parts:
			part_name = part['name']
			if '/' in part_name or part_name == '..':
				logging.warning('could not process source package %s %s: possibly malicious part name %s', name, version, part_name)
				return None
			part_url = source_entry.archive_uri('%s/%s' % (dir, part_name))
			# The remaining key of the part dict is its hash entry
			part_hash_type, part_hash = [(k, v) for k, v in part.iteritems() if k not in ('name', 'size')][0]
			if not ishex(part_hash):
				logging.warning('could not process source package %s %s: possibly malicious part hash %s', name, version, part_hash)
				return None
			logging.debug('found part file: %s %s %s', part_hash_type, part_hash, part_url)
			part_status, part_sha1 = status(part_hash_type, part_hash, part_url)
			logging.debug('checked part status: %s %s %s', part_status, part_sha1, part_url)
			if 'sha1' not in part and part_sha1: part['sha1'] = part_sha1
			if part_status == 'unmodified': parts_unmodified.append((part_sha1, part_name))
			elif part_status == 'modified': parts_modified.append((part_sha1, part_name))
			else: parts_unknown.append((part_sha1, part_name))
			if part_status == 'modified': files.append((part_sha1, part_hash_type, part_hash))

		# Compare counts before de-duplication so repeated parts count once each
		all_parts_unmodified = (len(parts_unmodified) == len(parts))
		parts_unmodified = list(set(parts_unmodified))
		logging.debug('source package status %s %s: dsc %s, %s parts unmodified, %s parts modified, %s parts unknown', name, version, dsc_status, len(parts_unmodified), len(parts_modified), len(parts_unknown))

		if all_parts_unmodified:
			# Ignore the srcpkg since we know all the parts were
			# in Debian at one point and ergo, it is unmodified
			logging.debug('finished processing source package %s %s: all non-dsc parts unmodified', name, version)
			if dsc_status == 'modified':
				logging.info('source package %s %s: unmodified, but dsc different', name, version)
				modifies_dsc_files += 1
			return (files, None, None, None)
		else:
			logging.debug('some parts modified, looking for derived version %s %s', name, version)
			if not dsc_sha1:
				logging.warning('finished processing source package %s %s: sha1 missing for dsc file', name, version)
				return (files, None, None, None)
			if parts_unknown:
				logging.warning('finished processing source package %s %s: sha1 missing for some parts', name, version)
				return (files, None, None, None)
			new = None
			link = None
			patch = None
			# Stage the derivative package parts in a temporary dir
			tmp_dir, size = prepare(dsc_name, dsc_sha1, parts)
			if not tmp_dir:
				logging.warning('source package %s %s: could not create temporary dir for deriv: %s', name, version, dsc_name)
				return (files, None, None, None)
			derived_from = find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified)
			if derived_from:
				debian_name, debian_version = derived_from
				link = (debian_name, debian_version, name, version, dsc_url)
				logging.debug('source package %s %s derived from %s %s', name, version, debian_name, debian_version)
				debian_files = srcpkg_to_files(debian_name, debian_version)
				if debian_files:
					debian_info = get_debian_info(debian_files)
					if debian_info:
						debian_dsc_sha1, debian_dsc_name, debian_parts = debian_info
						logging.debug('Debian source package %s %s dsc found %s %s', debian_name, debian_version, debian_dsc_name, debian_dsc_sha1)
						debian_tmp_dir = prepare_debian(debian_dsc_name, debian_dsc_sha1, debian_parts, size)
						if debian_tmp_dir:
							patch_created = create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1)
							if patch_created:
								patch_names = present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1)
								if patch_names:
									patch = (debian_name, debian_version, debian_dsc_sha1, name, version, dsc_sha1, [part[0] for part in parts_modified], patch_names)
								else:
									logging.debug('patch between %s %s and %s %s is probably not useful', debian_name, debian_version, name, version)
							rmtree(debian_tmp_dir)
						else:
							# This could be an issue with disk space, snapshots or a file that is not distributable
							logging.warning('source package %s %s: could not create temporary dir for Debian: %s %s', name, version, debian_name, debian_version)
					else:
						logging.warning('source package %s %s: could not get Debian info for %s %s: %s', name, version, debian_name, debian_version, debian_info)
				else:
					if srcpkg_was_in_debian(debian_name, debian_version):
						logging.warning('source package %s %s: snapshot database issue, no Debian files found', debian_name, debian_version)
					else:
						logging.warning('source package %s %s: derived from %s %s possibly bogus', name, version, debian_name, debian_version)
			else:
				new = (name, version, dsc_url)
			rmtree(tmp_dir)
			logging.debug('finished processing source package %s %s: all done', name, version)
			return (files, patch, link, new)
1111
def process_sources(source_entries, lists_dir):
	"""Process every source package of every sources.list entry.

	source_entries: iterable of apt source entry groups
	lists_dir: directory containing the Sources files previously
	           downloaded by get-package-lists

	Returns a tuple (files, patches, links, new) aggregating the
	results of check_source_package for each package.
	"""
	files = []
	patches = []
	links = []
	new = []
	for source in source_entries:
		for source_entry in source:
			logging.debug('processing sources.list entry %s', source_entry.describe)
			# The downloaded Sources file is named after the text inside
			# the final parentheses of the entry description
			fn = os.path.join(lists_dir, source_entry.describe.rstrip(')').rpartition('(')[2])
			# open() instead of the deprecated file() builtin
			try: f = open(fn)
			except IOError: continue
			# with-block ensures the file is closed even if processing raises
			with f:
				for srcpkg in deb822.Sources.iter_paragraphs(f):
					actions = check_source_package(source_entry, srcpkg)
					if actions:
						action_files, action_patch, action_link, action_new = actions
						if action_files:
							files.append(action_files)
							logging.debug('action: return files %s', ' '.join([' '.join([str(item) for item in action]) for action in action_files]))
						if action_patch:
							patches.append(action_patch)
							logging.debug('action: return patches %s', ' '.join([' '.join(action) for action in action_patch]))
						if action_link:
							links.append(action_link)
							logging.debug('action: return links to modified source packages %s', ' '.join(action_link))
						if action_new:
							new.append(action_new)
							logging.debug('action: return links to new source packages %s', ' '.join(action_new))
					logging.debug('done')
					logging.debug('')
	return (files, patches, links, new)
1143
logging.debug('processing distribution %s', derivative_short_name)

files, patches, links, new = process_sources(source_entries, lists_dir)

# Done with the database, close the connection
cur.close()
conn.close()

# Write out the results

# Modified-files list: map each sha1 to its other known hashes
filename = sys.argv[3]
data = files
if data:
	output_data = {}
	for package in data:
		for modified_file in package:
			sha1, hash_type, hash = modified_file
			if sha1 not in output_data:
				output_data[sha1] = {}
			if hash_type != 'sha1' and hash_type not in output_data[sha1]:
				output_data[sha1][hash_type] = hash
			elif hash_type != 'sha1' and hash != output_data[sha1][hash_type]:
				logging.warning('hashes mismatched: %s: %s %s != %s', sha1, hash_type, hash, output_data[sha1][hash_type])
	with open(os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)

# Patches list, plus the HEADER.html/.htaccess decorations for the
# global and per-derivative patch directories
filename = sys.argv[4]
data = patches
if data:
	if not os.path.exists(os.path.join(global_patch_dir,'HEADER.html')):
		symlink('../../doc/HEADER.patches.html',os.path.join(global_patch_dir,'HEADER.html'))
	if not os.path.exists(os.path.join(global_patch_dir,'.htaccess')):
		symlink('../../etc/htaccess.patches',os.path.join(global_patch_dir,'.htaccess'))
	if not os.path.exists(os.path.join(deriv_patch_dir,'HEADER.html')):
		symlink('../../../doc/HEADER.patches.html',os.path.join(deriv_patch_dir,'HEADER.html'))
	# Bug fix: this previously tested global_patch_dir (already created
	# above), so the per-derivative .htaccess symlink was never made
	if not os.path.exists(os.path.join(deriv_patch_dir,'.htaccess')):
		symlink('../../../etc/htaccess.patches',os.path.join(deriv_patch_dir,'.htaccess'))
	output_data = []
	for item in data:
		debian_name, debian_version, debian_sha1, name, version, sha1, parts_sha1, patches = item
		item = {}
		item['debian_name'] = debian_name
		item['debian_version'] = debian_version
		item['debian_sha1'] = debian_sha1
		item['name'] = name
		item['version'] = version
		item['sha1'] = sha1
		item['patches'] = patches
		item['parts'] = parts_sha1
		output_data.append(item)
	with open (os.path.abspath(filename), 'wb') as output:
		yaml.safe_dump(output_data, output)
else:
	remove(filename)
# Third: links from Debian source packages to the derivative's modified
# packages, keyed by Debian name then Debian version, dumped as YAML to the
# path given by the fifth command-line argument.
filename = sys.argv[5]
data = links
if data:
        # Deduplicate, then order by Debian name, Debian version, derivative
        # name, derivative version (versions use apt's comparison rules).
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]) or apt_version_cmp(a[3],b[3]))
        output_data = {}
        for debian_name, debian_version, name, version, dsc_url in data:
                by_version = output_data.setdefault(debian_name, {})
                entries = by_version.setdefault(debian_version, [])
                entries.append({'name': name, 'version': version, 'dsc': dsc_url})
        with open (os.path.abspath(filename), 'wb') as output:
                yaml.safe_dump(output_data, output)
else:
        # Nothing linked: drop any stale output from a previous run.
        remove(filename)
1218
# Fourth: source packages that appear to be new (not found in Debian),
# keyed by name then version, dumped as YAML to the path given by the
# sixth command-line argument.
filename = sys.argv[6]
data = new
if data:
        # Deduplicate and order by name then version (apt comparison).
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]))
        output_data = {}
        for name, version, dsc_url in data:
                output_data.setdefault(name, {}).setdefault(version, []).append(str(dsc_url))
        with open(os.path.abspath(filename), 'wb') as output:
                yaml.safe_dump(output_data, output)
else:
        # No new packages: drop any stale output from a previous run.
        remove(filename)
1235
1236 logging.shutdown()