#!/usr/bin/python

# Copyright 2011 Paul Wise
# Released under the MIT/Expat license, see doc/COPYING

# Uses the snapshot.debian.org metadata database and SHA-1 based filesystem to
# compute debdiffs between Debian and individual derivatives. The metadata
# makes it possible to know whether a particular file was ever in Debian and
# the filesystem allows the creation of debdiffs.
#
# The script works approximately like this:
#
# Load the Sources files previously downloaded by get-package-lists as indicated
# by the sources.list of the derivative.
#
# For each source package in the derivative:
#
# Check if the dsc has ever been in Debian; if not, check if the other
# parts have, and use that to decide if the package is unmodified or not.
# Unmodified source packages are skipped; they include those with the exact
# same dsc file and those where all the non-dsc parts are identical.
#
# Try some heuristics (name, version, changelog entries) to find out if
# the package could be based on some package that is or was in Debian.
#
# If it was not, make a note and skip to the next one, since Debian
# might want to know about source packages that are missing from Debian.
#
# If it was, use debdiff to create a diff and filterdiff to create a
# diff of the debian/ dir.
#
# Usage:
# compare-source-package-list <sources.list> <apt dir> <hash list> <patches list> <links list> <new package list> <log file>
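#
# Example invocation (file names are illustrative, not prescribed):
# compare-source-package-list sources.list apt hashes.yaml patches.yaml links.yaml new.yaml compare.log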

# FIXME: write out some statistics and rrdtool graphs
#               source package types per derivative
#               number of source packages
#               cache misses: md5, sha256, sha1, patch, changelog
# FIXME: comment the code to list assumptions and function purpose
# FIXME: add options to allow re-processing only specific packages
# FIXME: write something to clean up old files and patches
# FIXME: don't unpack or make a patch when we don't have all the parts
# FIXME: don't make a patch when we were not able to unpack the source package
# FIXME: cleanup files at start of run
# FIXME: extract new debian/patches/ patches
# FIXME: print out packages that are no longer in Debian
# FIXME: deal with really large patches:
# FIXME:   kde-l10n-*: too few parts to be useful
# FIXME:   divergence: too many changelog entries between versions to be useful
# FIXME:   derivative is older than Debian
# FIXME:   derivative renamed the source package
# FIXME:   just a really big diff
# FIXME: when there are multiple dsc files in snapshots, prefer the debian/debian-archive one
# FIXME: when the source package is ancient and the dsc is missing, make a fake one to use
# FIXME: add an in-memory cache of hashes so that hashes in multiple releases hit the disk once

import os
import sys
import httplib
import urllib2
import hashlib
import shutil
import logging
import tempfile
import string
import socket
import signal
import subprocess
import yaml
from debian import deb822, changelog
import apt_pkg
import psycopg2
try: import cjson as json
except ImportError: import json

# Helper functions for python stuff with annoying error handling

def makedirs(dirs):
        try: os.makedirs(dirs)
        except OSError: pass

def rmtree(dir):
        try: shutil.rmtree(dir)
        except OSError: pass

def remove(file):
        try: os.remove(file)
        except OSError: pass

def symlink(source, link):
        try: os.symlink(source, link)
        except OSError: pass

# http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
        # Python installs a SIGPIPE handler by default. This is usually not what
        # non-Python subprocesses expect.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# We need to map apt_pkg.version_compare return values to cmp return values
# The documentation is incorrect: http://bugs.debian.org/680891
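# e.g. apt_version_cmp('1.0-1', '1.0-2') returns -1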
def apt_version_cmp(a, b):
        ret = apt_pkg.version_compare(a, b)
        if ret < 0: return -1
        elif ret > 0: return 1
        else: return 0

# Config
md5_cache_dir = os.path.abspath('../md5-farm')
sha1_cache_dir = os.path.abspath('../sha1-farm')
sha256_cache_dir = os.path.abspath('../sha256-farm')
sha1_patch_dir = os.path.abspath('../sha1-patches')
sha1_lsdiff_dir = os.path.abspath('../sha1-lsdiff')
sha1_changelog_dir = os.path.abspath('../sha1-changelog')
deriv_patch_dir = os.path.abspath('patches')
global_patch_dir = os.path.abspath('../patches')
snapshot_cache_dir = '/srv/snapshot.debian.org/farm'
patch_too_large = os.path.abspath('../../doc/patch-too-large.txt')
checksum_types = ('sha1', 'sha256', 'md5sum')
checksum_hashlib = ('sha1', 'sha256', 'md5')
checksum_headers = ('Checksums-Sha1', 'Checksums-Sha256', 'Files')
user_agent = 'Debian Derivatives Census QA bot'
timeout = 60
ishex = lambda s: not(set(s)-set(string.hexdigits))

# Init
apt_pkg.init()

# Preparation
sources_list = apt_pkg.SourceList()
sources_list.read_main_list()
conn = psycopg2.connect("service=snapshot-guest")
cur = conn.cursor()
remove(sys.argv[7])
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG, filename=sys.argv[7])

# Voodoo
lists_dir = apt_pkg.config.find_dir('Dir::State::lists')
source_entries = [[i for i in x.index_files if i.label=='Debian Source Index'] for x in sources_list.list]
derivative_short_name = os.path.basename(os.getcwd())
modifies_dsc_files = 0
repackaged_but_identical = 0

# Helper functions for generating path names

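# The hash caches mirror the snapshot.debian.org farm layout: the first two
# pairs of hex digits become two directory levels, e.g. dir/ab/cd/abcdef...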
def hash_path_parent(dir, hash):
        return os.path.join(dir, hash[0:2], hash[2:4])

def hash_path(dir, hash):
        return os.path.join(dir, hash[0:2], hash[2:4], hash)

def hash_path_exists(dir, hash):
        return os.path.exists(os.path.join(dir, hash[0:2], hash[2:4], hash))

def sha1_patch_path(debian_dsc_sha1, dsc_sha1, type=None):
        path = os.path.join(hash_path(sha1_patch_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)

def sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1, type=None):
        path = os.path.join(hash_path(sha1_lsdiff_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        if type: path += '.%s' % type
        path += '.lsdiff'
        return os.path.abspath(path)

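# First letter of the package name, or first four letters for lib* packages,
# matching the layout of the Debian archive pool/ directories.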
def shortslug(name):
        return name[:4] if name.startswith('lib') else name[0]

def deriv_patch_path(name, version, debian_name, debian_version, type=None):
        path = os.path.join(deriv_patch_dir, shortslug(debian_name), debian_name, '')
        path += '_'.join((debian_name, debian_version, name, version))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)

def global_patch_path(name, version, debian_name, debian_version, type=None):
        path = os.path.join(global_patch_dir, shortslug(debian_name), debian_name, '')
        path += '_'.join(('Debian', debian_name, debian_version, derivative_short_name, name, version))
        if type: path += '.%s' % type
        path += '.patch'
        return os.path.abspath(path)

# Functions for munging source packages

def convert_lzip_to_gzip(dir, name):
        cmdline = ['lzip', '-d', name]
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('lzip reported failure to decompress %s:', name)
                logging.warning(output)
        bname = name[0:-3] # Strip off .lz
        cmdline = ['gzip', '-1', bname] # gzip -1 to reduce overhead
        process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('gzip reported failure to compress %s:', bname)
                logging.warning(output)
        return (name, bname+'.gz')

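# Rewrite the checksum and size entries in a dsc file for parts that were
# recompressed, so that tools like dpkg-source still accept the new parts.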
def update_dsc_file(dir, dsc_name, parts):
        dsc_path = os.path.join(dir,dsc_name)
        dsc_file = open(dsc_path,'rb')
        dsc = deb822.Dsc(dsc_file)
        for (old, name) in parts:
                path = os.path.join(dir,name)
                size = os.path.getsize(path)
                with open(path,'rb') as f:
                        hashes = {}
                        for (type, func) in zip(checksum_types, checksum_hashlib):
                                hashes[type] = getattr(hashlib, func)()
                        for chunk in iter(lambda: f.read(128*64L), b''):
                                for type in checksum_types:
                                        hashes[type].update(chunk)
                        for type in checksum_types:
                                hashes[type] = hashes[type].hexdigest()
                        for (header, type) in zip(checksum_headers, checksum_types):
                                if header in dsc:
                                        dsc[header] = [{type: hashes[type], 'size': size, 'name': name} if p['name'] == old else p for p in dsc[header]]
        dsc_file.close()
        os.remove(dsc_path) # So we don't change the original that the dsc links to
        dsc_file = open(dsc_path,'wb')
        dsc.dump(dsc_file)
        dsc_file.close()

# Functions for downloading files and storing them in the hash caches

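# Download a file known only by md5/sha256, verify it against the expected
# hash and store it by sha1; returns (True, sha1) if the content is in the
# snapshot farm, (False, sha1) if only in the derivatives cache, or
# ('unknown', None) on any failure.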
def download_and_check_hash(url, dir, hash, hash_type):
        try:
                parent = hash_path_parent(dir,hash)
                path = hash_path(dir,hash)
                logging.debug('downloading %s', url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                data = u.read()
                if hash_type == 'sha256':
                        data_hash = hashlib.sha256(data).hexdigest()
                elif hash_type == 'md5sum':
                        data_hash = hashlib.md5(data).hexdigest()
                else:
                        logging.warning('unknown hash type detected: %s %s %s', hash_type, hash, url)
                        return ('unknown', None)
                if data_hash != hash:
                        logging.warning('incorrect hash for downloaded file, ignoring: %s %s != %s %s', hash_type, hash, data_hash, url)
                        return ('unknown', None)
                sha1 = hashlib.sha1(data).hexdigest()
                sha1_path = hash_path(sha1_cache_dir, sha1)
                sha1_parent = hash_path_parent(sha1_cache_dir, sha1)
                makedirs(sha1_parent)
                if hash_path_exists(snapshot_cache_dir, sha1):
                        snapshot_path = hash_path(snapshot_cache_dir, sha1)
                        symlink(snapshot_path, path)
                        logging.debug('exists in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (True, sha1)
                else:
                        if not os.path.exists(sha1_path):
                                logging.debug('correct hash for downloaded file, saving: %s %s %s %s', hash_type, hash, sha1, url)
                                f = open(sha1_path, 'w')
                                f.write(data)
                                f.close()
                        else:
                                logging.debug('correct hash for downloaded file, not saving: already in derivs cache: %s %s %s %s', hash_type, hash, sha1, url)
                        symlink(os.path.relpath(sha1_path, os.path.dirname(path)), path)
                        logging.debug('does not exist in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (False, sha1)
        except urllib2.URLError, e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download hash file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException, e:
                logging.warning('unable to download hash file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error, e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)

def download_sha1(url, dir, sha1):
        try:
                parent = hash_path_parent(dir,sha1)
                path = hash_path(dir,sha1)
                logging.debug('downloading sha1: %s %s', sha1, url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                data = u.read()
                data_sha1 = hashlib.sha1(data).hexdigest()
                if data_sha1 == sha1:
                        logging.debug('correct sha1 for downloaded file, saving: %s %s', sha1, url)
                        if not os.path.exists(path):
                                f = open(path, 'w')
                                f.write(data)
                                f.close()
                        return (False, sha1)
                else:
                        logging.warning('incorrect sha1 for downloaded file, ignoring: %s != %s %s', sha1, data_sha1, url)
                        return ('unknown', None)
        except urllib2.URLError, e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download sha1 file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException, e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error, e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', e, url)
                return ('unknown', None)

# Functions for checking the hash caches

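# Look up a non-sha1 hash in its cache of symlinks into the sha1 farms,
# downloading the file if it has not been seen before.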
def check_hash_cache(dir, hash, hash_type, url):
        logging.debug('checking hash cache: %s %s', hash_type, hash)
        path = hash_path(dir, hash)
        try:
                result = os.readlink(path)
                path = os.path.join(os.path.dirname(path), result)
        except OSError:
                logging.debug('does not exist in hash cache: %s %s', hash_type, hash)
                return download_and_check_hash(url, dir, hash, hash_type)
        logging.debug('exists in hash cache: %s %s', hash_type, hash)
        sha1 = os.path.basename(path)
        if hash_path_exists(snapshot_cache_dir, sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                # The symlink target vanished from both sha1 farms; drop the
                # stale link and download the file again.
                logging.debug('stale hash cache symlink, re-downloading: %s %s', hash_type, hash)
                remove(hash_path(dir, hash))
                return download_and_check_hash(url, dir, hash, hash_type)

def check_sha1_cache(sha1, url):
        logging.debug('checking sha1 caches: %s', sha1)
        if hash_path_exists(snapshot_cache_dir, sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                remove(hash_path(sha1_cache_dir,sha1))
                return (True, sha1)
        elif hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        else:
                logging.debug('does not exist in any sha1 caches: %s', sha1)
                return download_sha1(url, sha1_cache_dir, sha1)

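# Map a (hash type, hash) pair to ('unmodified'|'modified'|'unknown', sha1):
# unmodified means the file is in the snapshot farm and so was in Debian at
# some point, modified means we only have it in the derivatives cache.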
def status(type, hash, url):
        logging.debug('checking status of hash: %s %s %s', type, hash, url)
        if type == 'sha1':
                (ret, sha1) = check_sha1_cache(hash, url)
        elif type == 'sha256':
                (ret, sha1) = check_hash_cache(sha256_cache_dir, hash, type, url)
        elif type == 'md5sum':
                (ret, sha1) = check_hash_cache(md5_cache_dir, hash, type, url)
        else:
                logging.warning('unknown hash type detected: %s %s %s', type, hash, url)
                return ('unknown', None)
        if ret == True:
                return ('unmodified', sha1)
        elif ret == False:
                return ('modified', sha1)
        else:
                return (ret, sha1)

# Functions for getting information about source packages

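# Extract the dsc file entry and the list of non-dsc parts from a derivative
# Sources stanza; returns (dsc hash type, dsc hash, dsc name, parts) or None.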
def get_info(srcpkg):
        dsc = None
        for header in checksum_headers:
                if not dsc and header in srcpkg:
                        dsc = [x for x in srcpkg[header] if x['name'].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any dsc files')
                return None
        if len(dsc) > 1:
                logging.warning('found multiple dsc files: %s' % ' '.join([d['name'] for d in dsc]))
                return None
        dsc = dsc[0]
        dsc_name = dsc['name']
        dsc_hash_type, dsc_hash = [(k, v) for k, v in dsc.iteritems() if k not in ('name', 'size')][0]

        parts = []
        part_names = []
        for header in checksum_headers:
                if header in srcpkg:
                        for part in srcpkg[header]:
                                if 'name' in part and part['name'] not in part_names and not part['name'].endswith('.dsc'):
                                        parts.append(part)
                                        part_names.append(part['name'])

        return (dsc_hash_type, dsc_hash, dsc_name, parts)

def get_debian_info(files):
        dsc = [file for file in files if file[0].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any Debian dsc files: snapshots bug or ancient source package')
                return None
        if len(dsc) > 1:
                logging.warning('found multiple Debian dsc files, choosing first one: %s' % ' '.join(['%s %s' % (dsc_name, dsc_sha1) for dsc_name, dsc_sha1 in dsc]))

        dsc = dsc[0]
        dsc_name, dsc_sha1 = dsc

        parts = []
        part_names = []
        for file in files:
                part_name, part_sha1 = file
                if part_name not in part_names and not part_name.endswith('.dsc'):
                        parts.append(file)
                        part_names.append(part_name)

        return (dsc_sha1, dsc_name, parts)

# Functions for extracting information from the snapshots database

def database_error(e):
        reason = None
        code = None
        if hasattr(e, 'pgerror'): reason = e.pgerror
        if hasattr(e, 'pgcode'): code = e.pgcode
        logging.warning('unable to execute database query: %s %s', code, reason)
        conn.reset()

def srcpkg_was_in_debian(name, version=None):
        try:
                if version:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s AND version=%s LIMIT 1;', (name, version))
                else:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s LIMIT 1;', (name,))
                return not not cur.fetchone()
        except psycopg2.Error, e:
                database_error(e)
                return None

def sha1_to_srcpkgs(sha1):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_sha1s(name, version):
        try:
                cur.execute(
                        '''SELECT hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        WHERE name=%s AND version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_srcpkgs(name):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s ORDER BY version DESC;''', (name,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def sha1s_to_files(sha1):
        try:
                cur.execute('SELECT DISTINCT ON (name, hash) name, hash FROM file WHERE hash=%s;', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_files(name, version):
        try:
                cur.execute(
                        '''SELECT DISTINCT ON (file.name, file.hash) file.name, file.hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        JOIN file ON file_srcpkg_mapping.hash=file.hash
                        WHERE srcpkg.name=%s AND srcpkg.version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

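# Find the Debian source package a file most likely came from: prefer the
# highest version at or below the given version, falling back to the lowest
# version that contains the file.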
def sha1_version_to_derived_from(sha1, version):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s and version<=%s
                        ORDER BY name ASC, version DESC
                        LIMIT 1;''', (sha1, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s
                        ORDER BY name ASC, version ASC
                        LIMIT 1;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

def srcpkg_to_derived_from(name, version):
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s and version<=%s
                        ORDER BY version DESC
                        LIMIT 1;''', (name, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s
                        ORDER BY version ASC
                        LIMIT 1;''', (name,))
                return cur.fetchall()
        except psycopg2.Error, e:
                database_error(e)
                return None

# Functions related to creating patches

# Add symlinks for all needed files
def prepare(dsc_name, dsc_sha1, parts):
        logging.debug('preparing deriv directory for %s', dsc_name)
        tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-%s-' % derivative_short_name)
        path = hash_path(snapshot_cache_dir, dsc_sha1)
        if not os.path.exists(path): path = hash_path(sha1_cache_dir, dsc_sha1)
        dsc_path = os.path.join(tmp_dir, dsc_name)
        os.symlink(path, dsc_path)
        converted_parts = []
        for part in parts:
                if 'sha1' in part:
                        path = hash_path(snapshot_cache_dir, part['sha1'])
                        if not os.path.exists(path): path = hash_path(sha1_cache_dir, part['sha1'])
                elif 'sha256' in part: path = hash_path(sha256_cache_dir, part['sha256'])
                elif 'md5sum' in part: path = hash_path(md5_cache_dir, part['md5sum'])
                part_path = os.path.join(tmp_dir, part['name'])
                os.symlink(path, part_path)
                # Some distributions allow additional compression schemes
                # Here we work around this by recompressing with gzip
                if part['name'].endswith('.lz'):
                        converted_parts.append(convert_lzip_to_gzip(tmp_dir, part['name']))
        # Update the dsc file if we recompressed any files
        if converted_parts:
                update_dsc_file(tmp_dir, dsc_name, converted_parts)
        return tmp_dir

def prepare_debian(dsc_name, dsc_sha1, files):
        logging.debug('preparing Debian directory for %s', dsc_name)
        readable_parts = 0
        debian_tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-Debian-')
        path = hash_path(snapshot_cache_dir, dsc_sha1)
        if os.access(path, os.R_OK): readable_parts += 1
        dsc_path = os.path.join(debian_tmp_dir, dsc_name)
        os.symlink(path, dsc_path)
        for file in files:
                part_name, part_sha1 = file
                path = hash_path(snapshot_cache_dir, part_sha1)
                part_path = os.path.join(debian_tmp_dir, part_name)
                os.symlink(path, part_path)
                if os.access(path, os.R_OK): readable_parts += 1
        if readable_parts != (1 + len(files)):
                logging.info('only %s parts of %s are readable', readable_parts, dsc_name)
                rmtree(debian_tmp_dir)
                return None
        return debian_tmp_dir

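# Unpack the source package and return its changelog as a list of
# (package, version) tuples, caching the result per dsc sha1.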
def get_changelog_entries(tmp_dir, dsc_name, dsc_sha1):
        logging.debug('getting changelog entries from %s', dsc_name)

        # Cache check
        changelog_path = hash_path(sha1_changelog_dir, dsc_sha1)
        if os.path.exists(changelog_path):
                logging.debug('changelog cache exists for %s %s', dsc_name, dsc_sha1)
                f = file(changelog_path)
                if f:
                        try: changelog_entries = json.load(f)
                        except ValueError: pass
                        else: return [tuple(entry) for entry in changelog_entries]
                        finally: f.close()

        # Preparation
        extract_path = os.path.join(tmp_dir,'extracted')

        # Unpack the source tree
        logging.debug('unpacking source package %s', dsc_name)
        cmdline = ['dpkg-source', '-x', dsc_name, 'extracted']
        process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('dpkg-source reported failure to extract %s:', dsc_name)
                logging.warning(output)
                cmdline = ['ls', '-lR', '--time-style=+']
                process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                output = process.communicate()[0]
                logging.warning(output)
                rmtree(extract_path)
                return None

        # Sanitise the debian dir and changelog file in case either is a symlink pointing outside the tree
        debian_dir = os.path.join(extract_path, 'debian')
        changelog_filename = os.path.join(debian_dir,'changelog')
        if os.path.islink(debian_dir) or os.path.islink(changelog_filename):
                logging.warning('debian dir or changelog is a symbolic link %s', dsc_name)
                rmtree(extract_path)
                return None

        # Check if the changelog exists
        if not os.path.exists(changelog_filename):
                logging.warning('could not find changelog in %s', dsc_name)
                rmtree(extract_path)
                return None

        # Find out which source package is the most likely derivative
        logging.debug('parsing changelog for %s', dsc_name)
        changelog_file = open(changelog_filename)
        changelog_obj = changelog.Changelog(changelog_file)
        try:
                changelog_entries = [(entry.package, str(entry._raw_version)) for entry in changelog_obj]
        except:
                logging.warning('could not read changelog from %s', dsc_name)
                rmtree(extract_path)
                return None
        del changelog_obj
        changelog_file.close()

        # Clean up again
        rmtree(extract_path)

        # Write the cache
        makedirs(hash_path_parent(sha1_changelog_dir, dsc_sha1))
        remove(changelog_path)
        f = file(changelog_path, 'w')
        json.dump(changelog_entries, f)
        f.close()

        return changelog_entries

# Find the source package name and version this is probably derived from
def find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified):
        logging.debug('finding base source package of %s %s', name, version)

        # Get a list of changelog entries
        changelog_entries = get_changelog_entries(tmp_dir, dsc_name, dsc_sha1)
        if changelog_entries:
                logging.debug('changelog entries are: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in changelog_entries]))

        # Get a list of candidate versions from the database
        possibly_derived_from = []
        logging.debug('checking which parts were in Debian')
        for part_sha1, part_name in parts_unmodified:
                part_derived_from = sha1_to_srcpkgs(part_sha1)
                if part_derived_from:
                        logging.debug('part %s %s available in %s', part_sha1, part_name, ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in part_derived_from]))
                        possibly_derived_from.extend(part_derived_from)

        if not possibly_derived_from:
                logging.debug('no parts in common with Debian, obtaining old versions')
                old_packages = srcpkg_to_srcpkgs(name)
                if old_packages: possibly_derived_from = old_packages

        # Uniqify
        possibly_derived_from = list(set(possibly_derived_from))
        if possibly_derived_from:
                logging.debug('possibly derived from: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in possibly_derived_from]))
        else:
                logging.debug('nothing in possibly derived from list')

        # Match changelog versions against candidates
        if changelog_entries:
                logging.debug('matching changelog entries against versions possibly derived from')
                for entry in changelog_entries:
                        entry_name, entry_version = entry
                        if entry in possibly_derived_from:
                                logging.debug('%s %s in possibly derived from', entry_name, entry_version)
                                return entry
                logging.debug('checking if changelog entries were ever in Debian')
                for entry_name, entry_version in changelog_entries:
                        if srcpkg_was_in_debian(entry_name, entry_version):
                                logging.debug('%s %s was in Debian', entry_name, entry_version)
                                return (entry_name, entry_version)
        if possibly_derived_from:
                logging.debug('finding closest entry in possibly derived from')
                possibly_derived_from.sort(cmp=lambda a,b: apt_version_cmp(b[1],a[1]))
                for entry_name, entry_version in possibly_derived_from:
                        if name == entry_name and apt_version_cmp(version, entry_version) >= 0:
                                logging.debug('%s %s is an equal or lower version', entry_name, entry_version)
                                return (entry_name, entry_version)
                entry = possibly_derived_from[-1]
                entry_name, entry_version = entry
                logging.debug('no lower version numbers, returning next highest version %s %s', entry_name, entry_version)
                return entry
        logging.debug('finding closest version number in Debian')
        for entry in srcpkg_to_derived_from(name, version) or []:
                entry_name, entry_version = entry
                logging.debug('closest package was %s %s', entry_name, entry_version)
                return entry
        logging.debug('could not find Debian package %s %s is derived from', name, version)
        return None

# Generate a patch file
def create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1):
        global repackaged_but_identical

        dsc_path = os.path.join(tmp_dir, dsc_name)
        debian_dsc_path = os.path.join(debian_tmp_dir, debian_dsc_name)
        path_everything = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
        path_debian = sha1_patch_path(debian_dsc_sha1, dsc_sha1, 'debian')

        # Generate the main patch
        if not os.path.exists(path_everything) and os.path.exists(debian_dsc_path) and os.path.exists(dsc_path):
                makedirs(os.path.dirname(path_everything))
                cmdline = ['debdiff', '--quiet', '--diffstat', debian_dsc_path, dsc_path]
                stdout = open(path_everything, 'w')
                process = subprocess.Popen(cmdline, stdout=stdout, stderr=subprocess.PIPE, preexec_fn=subprocess_setup)
                output = process.communicate()[1]
                stdout.close()
                if process.returncode == 255:
                        logging.warning('debdiff reported failure %s %s:', debian_dsc_name, dsc_name)
                        logging.warning(output)
                        cmdline = ['ls', '-lR', '--time-style=+']
                        for name, dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
                                logging.warning('dir listing for %s:', name)
                                process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                                output = process.communicate()[0]
                                logging.warning(output)
                        return False
                elif process.returncode == 0:
                        logging.info('derivative repackaged in an identical way %s %s', debian_dsc_name, dsc_name)
                        repackaged_but_identical += 1
                        return False
                elif process.returncode != 1:
                        logging.warning('debdiff reported unknown return code %s %s %s:', process.returncode, debian_dsc_name, dsc_name)
                        logging.warning(output)
                        cmdline = ['ls', '-lR', '--time-style=+']
                        for name, dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
                                logging.warning('dir listing for %s:', name)
                                process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                                output = process.communicate()[0]
                                logging.warning(output)
                        return False

        # Filter the main patch to include only the debian/ directory
        if os.path.exists(path_everything) and not os.path.exists(path_debian):
                makedirs(os.path.dirname(path_debian))
                cmdline = ['filterdiff', '--include=*/debian/*', path_everything]
                filterdiff = subprocess.Popen(cmdline, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
                filterdiff_output = filterdiff.communicate()[0]
                diffstat = subprocess.Popen('diffstat', stdin=subprocess.PIPE, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
                diffstat_output = diffstat.communicate(filterdiff_output)[0]
                f = open(path_debian, 'w')
                f.write('diffstat of debian/ for %s %s\n' % (os.path.splitext(debian_dsc_name)[0], os.path.splitext(dsc_name)[0]))
                f.write('\n')
                f.write(diffstat_output)
                f.write('\n')
                f.write(filterdiff_output)
                f.close()

        # Patches > 100MB are probably not that useful, replace them with a link
        for path in path_everything, path_debian:
                try:
                        if os.path.getsize(path) > 104857600:
                                logging.info('patch between %s and %s is larger than 100MB', dsc_name, debian_dsc_name)
                                remove(path)
                                symlink(os.path.relpath(patch_too_large, os.path.dirname(path)), path)
                except OSError:
                        pass

        return True

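# A patch is considered useful if it touches any file other than
# debian/changelog.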
def check_patch(debian_dsc_sha1, dsc_sha1):
        patch_path = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
        lsdiff_path = sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1)
        if os.path.exists(lsdiff_path):
                logging.debug('lsdiff cache exists for %s', patch_path)
                f = file(lsdiff_path)
                lsdiff = f.read()
                f.close()
        else:
                logging.debug('lsdiff cache does not exist for %s', patch_path)
                cmdline = ['lsdiff', patch_path]
                process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                lsdiff = process.communicate()[0]
                makedirs(os.path.dirname(lsdiff_path))
                f = file(lsdiff_path,'w')
                f.write(lsdiff)
                f.close()
        lsdiff = lsdiff.splitlines()
        for line in lsdiff:
                if line != 'debian/changelog' and not line.endswith('/debian/changelog'):
                        return True
        return False

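# Publish the cached patches under stable per-derivative and global names by
# symlinking into the patch directories; returns the relative paths of the
# useful patches.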
def present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1):
        useful_patch = check_patch(debian_dsc_sha1, dsc_sha1)
        patches = []
        types = ('', 'debian')
        for type in types:
                ln_to = sha1_patch_path(debian_dsc_sha1, dsc_sha1, type)
                if not os.path.exists(ln_to):
                        continue
                ln_from_deriv = deriv_patch_path(name, version, debian_name, debian_version, type)
                ln_from_global = global_patch_path(name, version, debian_name, debian_version, type)
                makedirs(os.path.dirname(ln_from_deriv))
                makedirs(os.path.dirname(ln_from_global))
                remove(ln_from_deriv)
                remove(ln_from_global)
                symlink(os.path.relpath(ln_to, os.path.dirname(ln_from_deriv)), ln_from_deriv)
                symlink(os.path.relpath(ln_to, os.path.dirname(ln_from_global)), ln_from_global)
                if useful_patch:
                        patches.append(os.path.relpath(ln_from_global, os.path.abspath(global_patch_dir)))
        return tuple(patches)

# Functions that wrap other functions and decide what to do

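# Process one source package stanza: classify its parts as modified or not,
# find the Debian package it is derived from, then create and publish patches.
# Returns (files, patch, link, new) or None if nothing needs recording.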
def check_source_package(source_entry, srcpkg):
        global modifies_dsc_files

        try:
                name = None
                version = None
                dir = None
                name = srcpkg['Package']
                version = srcpkg['Version']
                dir = srcpkg['Directory']
                if '/' in name or name == '..':
                        logging.warning('could not process source package %s %s: possibly malicious name', name, version)
                        return None
                if '/' in version or version == '..':
                        logging.warning('could not process source package %s %s: possibly malicious version', name, version)
                        return None
                if '..' in dir.split('/'):
                        logging.warning('could not process source package %s %s: possibly malicious dir: %s', name, version, dir)
                        return None
        except KeyError:
                logging.warning('could not process source package %s %s', name, version)
                return None
        logging.debug('started processing source package %s %s', name, version)
        info = get_info(srcpkg)
        if not info:
                logging.warning('finished processing source package %s %s: could not get any info', name, version)
                return None
        dsc_hash_type, dsc_hash, dsc_name, parts = info
        if '/' in dsc_name or dsc_name == '..':
                logging.warning('could not process source package %s %s: possibly malicious dsc name %s', name, version, dsc_name)
                return None
        if not ishex(dsc_hash):
                logging.warning('could not process source package %s %s: possibly malicious dsc hash %s', name, version, dsc_hash)
                return None
        dsc_url = source_entry.archive_uri('%s/%s' % (dir, dsc_name))
        logging.debug('found dsc file: %s %s %s', dsc_hash_type, dsc_hash, dsc_url)
        dsc_status, dsc_sha1 = status(dsc_hash_type, dsc_hash, dsc_url)
        logging.debug('checked dsc status: %s %s %s', dsc_status, dsc_sha1, dsc_url)
        if dsc_status == 'unmodified':
                # Ignore the srcpkg since we know it was in Debian
                # at one point and is hopefully therefore unmodified
                logging.debug('finished processing source package %s %s: dsc unmodified', name, version)
                return None
        else:
                files = [(dsc_sha1, dsc_hash_type, dsc_hash)]
                parts_unmodified = []
                parts_modified = []
                parts_unknown = []
                for part in parts:
                        part_name = part['name']
                        if '/' in part_name or part_name == '..':
                                logging.warning('could not process source package %s %s: possibly malicious part name %s', name, version, part_name)
                                return None
                        part_url = source_entry.archive_uri('%s/%s' % (dir, part_name))
                        part_hash_type, part_hash = [(k, v) for k, v in part.iteritems() if k not in ('name', 'size')][0]
                        if not ishex(part_hash):
                                logging.warning('could not process source package %s %s: possibly malicious part hash %s', name, version, part_hash)
                                return None
                        logging.debug('found part file: %s %s %s', part_hash_type, part_hash, part_url)
                        part_status, part_sha1 = status(part_hash_type, part_hash, part_url)
                        logging.debug('checked part status: %s %s %s', part_status, part_sha1, part_url)
                        if 'sha1' not in part and part_sha1: part['sha1'] = part_sha1
                        if part_status == 'unmodified': parts_unmodified.append((part_sha1, part_name))
                        elif part_status == 'modified': parts_modified.append((part_sha1, part_name))
                        else: parts_unknown.append((part_sha1, part_name))
                        if part_status == 'modified': files.append((part_sha1, part_hash_type, part_hash))

                all_parts_unmodified = (len(parts_unmodified) == len(parts))
                parts_unmodified = list(set(parts_unmodified))
                logging.debug('source package status %s %s: dsc %s, %s parts unmodified, %s parts modified, %s parts unknown', name, version, dsc_status, len(parts_unmodified), len(parts_modified), len(parts_unknown))

                if all_parts_unmodified:
                        # Ignore the srcpkg since we know all the parts were
                        # in Debian at one point and ergo, it is unmodified
                        logging.debug('finished processing source package %s %s: all non-dsc parts unmodified', name, version)
                        if dsc_status == 'modified':
                                logging.info('source package %s %s: unmodified, but dsc different', name, version)
                                modifies_dsc_files += 1
                        return (files, None, None, None)
                else:
                        logging.debug('some parts modified, looking for derived version %s %s', name, version)
                        if not dsc_sha1:
                                logging.warning('finished processing source package %s %s: sha1 missing for dsc file', name, version)
                                return (files, None, None, None)
                        if parts_unknown:
                                logging.warning('finished processing source package %s %s: sha1 missing for some parts', name, version)
                                return (files, None, None, None)
                        new = None
                        link = None
                        patch = None
                        tmp_dir = prepare(dsc_name, dsc_sha1, parts)
                        derived_from = find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified)
                        if derived_from:
                                debian_name, debian_version = derived_from
                                link = (debian_name, debian_version, name, version, dsc_url)
                                logging.debug('source package %s %s derived from %s %s', name, version, debian_name, debian_version)
                                debian_files = srcpkg_to_files(debian_name, debian_version)
                                if debian_files:
                                        debian_info = get_debian_info(debian_files)
                                        if debian_info:
                                                debian_dsc_sha1, debian_dsc_name, debian_parts = debian_info
                                                logging.debug('Debian source package %s %s dsc found %s %s', debian_name, debian_version, debian_dsc_name, debian_dsc_sha1)
                                                debian_tmp_dir = prepare_debian(debian_dsc_name, debian_dsc_sha1, debian_parts)
                                                if debian_tmp_dir:
                                                        patch_created = create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1)
                                                        if patch_created:
                                                                patch_names = present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1)
                                                                if patch_names: patch = (debian_name, debian_version, debian_dsc_sha1, name, version, dsc_sha1, [part[0] for part in parts_modified], patch_names)
                                                        rmtree(debian_tmp_dir)
                                                else:
                                                        # This could be an issue with snapshots or a file that is not distributable
                                                        logging.info('source package %s %s: could not create temporary dir for Debian: %s %s', name, version, debian_name, debian_version)
                                        else:
                                                logging.warning('source package %s %s: could not get Debian info for %s %s: %s', name, version, debian_name, debian_version, debian_info)
                                else:
                                        if srcpkg_was_in_debian(debian_name, debian_version):
                                                logging.warning('source package %s %s: snapshot database issue, no Debian files found', debian_name, debian_version)
                                        else:
                                                logging.warning('source package %s %s: derived from %s %s possibly bogus', name, version, debian_name, debian_version)
                        else:
                                new = (name, version, dsc_url)
                        rmtree(tmp_dir)
                        logging.debug('finished processing source package %s %s: all done', name, version)
                        return (files, patch, link, new)

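# Iterate over all Sources files for the derivative and collect the hashes,
# patches, links and new packages returned by check_source_package.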
def process_sources(source_entries, lists_dir):
        files = []
        patches = []
        links = []
        new = []
        for source in source_entries:
                for source_entry in source:
                        fn = os.path.join(lists_dir, source_entry.describe.rstrip(')').rpartition('(')[2])
                        try: f = file(fn)
                        except IOError: continue
                        for srcpkg in deb822.Sources.iter_paragraphs(f):
                                actions = check_source_package(source_entry, srcpkg)
                                if actions:
                                        action_files, action_patch, action_link, action_new = actions
                                        if action_files:
                                                files.append(action_files)
                                                logging.debug('action: return files %r', action_files)
                                        if action_patch:
                                                patches.append(action_patch)
                                                logging.debug('action: return patches %r', (action_patch,))
                                        if action_link:
                                                links.append(action_link)
                                                logging.debug('action: return links to modified source packages %s', ' '.join(action_link))
                                        if action_new:
                                                new.append(action_new)
                                                logging.debug('action: return links to new source packages %s', ' '.join(action_new))
                                logging.debug('done')
                                logging.debug('')
                        f.close()
        return (files, patches, links, new)

logging.debug('processing distribution %s', derivative_short_name)

files, patches, links, new = process_sources(source_entries, lists_dir)

# Done with the database, close the connection
cur.close()
conn.close()

# Write out the results
filename = sys.argv[3]
data = files
if data:
        output_data = {}
        for package in data:
                for modified_file in package:
                        sha1, hash_type, hash = modified_file
                        if sha1 not in output_data:
                                output_data[sha1] = {}
                        if hash_type != 'sha1' and hash_type not in output_data[sha1]:
                                output_data[sha1][hash_type] = hash
                        elif hash_type != 'sha1' and hash != output_data[sha1][hash_type]:
                                logging.warning('hashes mismatched: %s: %s %s != %s', sha1, hash_type, hash, output_data[sha1][hash_type])
        output = file(os.path.abspath(filename), 'wb')
        yaml.safe_dump(output_data, output)
        output.close()

filename = sys.argv[4]
data = patches
if data:
        if not os.path.exists(os.path.join(global_patch_dir,'HEADER.html')):
                symlink('../../doc/HEADER.patches.html',os.path.join(global_patch_dir,'HEADER.html'))
        if not os.path.exists(os.path.join(global_patch_dir,'.htaccess')):
                symlink('../../etc/htaccess.patches',os.path.join(global_patch_dir,'.htaccess'))
        if not os.path.exists(os.path.join(deriv_patch_dir,'HEADER.html')):
                symlink('../../../doc/HEADER.patches.html',os.path.join(deriv_patch_dir,'HEADER.html'))
        if not os.path.exists(os.path.join(deriv_patch_dir,'.htaccess')):
                symlink('../../../etc/htaccess.patches',os.path.join(deriv_patch_dir,'.htaccess'))
        output_data = []
        for item in data:
                debian_name, debian_version, debian_sha1, name, version, sha1, parts_sha1, patches = item
                item = {}
                item['debian_name'] = debian_name
                item['debian_version'] = debian_version
                item['debian_sha1'] = debian_sha1
                item['name'] = name
                item['version'] = version
                item['sha1'] = sha1
                item['patches'] = patches
                item['parts'] = parts_sha1
                output_data.append(item)
        output = file(os.path.abspath(filename), 'wb')
        yaml.safe_dump(output_data, output)
        output.close()
else:
        remove(filename)

filename = sys.argv[5]
data = links
if data:
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]) or apt_version_cmp(a[3],b[3]))
        output_data = {}
        output = file(os.path.abspath(filename), 'wb')
        for debian_name, debian_version, name, version, dsc_url in data:
                if debian_name not in output_data:
                        output_data[debian_name] = {}
                if debian_version not in output_data[debian_name]:
                        output_data[debian_name][debian_version] = []
                item = {}
                item['name'] = name
                item['version'] = version
                item['dsc'] = dsc_url
                output_data[debian_name][debian_version].append(item)
        yaml.safe_dump(output_data, output)
        output.close()
else:
        remove(filename)

filename = sys.argv[6]
data = new
if data:
        data = list(set(data))
        data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]))
        output_data = {}
        output = file(os.path.abspath(filename), 'wb')
        for name, version, dsc_url in data:
                if name not in output_data:
                        output_data[name] = {}
                if version not in output_data[name]:
                        output_data[name][version] = []
                output_data[name][version].append(str(dsc_url))
        yaml.safe_dump(output_data, output)
        output.close()
else:
        remove(filename)

logging.shutdown()