a5e1b6d10fa1719c385c3979c9a648b8c4a6d2f5
[dex/census.git] / bin / compare-source-package-list
1 #!/usr/bin/python
2
3 # Copyright 2011 Paul Wise
4 # Released under the MIT/Expat license, see doc/COPYING
5
6 # Uses the snapshot.debian.org metadata database and SHA-1 based filesystem to
7 # compute debdiffs between Debian and individual derivatives. The metadata
8 # allows knowing if a particular file was ever in Debian and the filesystem
9 # allows the creation of debdiffs.
10 #
11 # The script works approximately like this:
12 #
13 # Load the Sources files previously downloaded by get-package-lists as indicated
14 # by the sources.list of the derivative.
15 #
16 # For each source package in the derivative:
17 #
18 # Check if the dsc has ever been in Debian, if not, check if the other
19 # parts have and therefore decide if the package is unmodified or not.
20 # Unmodified source packages are skipped and include those with the exact
21 # same dsc file or those where all the non-dsc parts are identical.
22 #
23 # Try some heuristics (name, version, changelog entries) to find out if
24 # the package could be based on some package that is or was in Debian.
25 #
26 # If it was not then skip to the next one and make a note, since Debian
27 # might want to know about source packages that are missing from Debian.
28 #
29 # If it was then use debdiff to create a diff and filterdiff to create a
30 # diff of the debian/ dir.
31 #
32 # Usage:
33 # compare-source-package-list <sources.list> <apt dir> <patches list> <links list> <new package list> <log file>
34
35 # FIXME: write out some statistics and rrdtool graphs
36 #               source package types per derivative
37 #               number of source packages
38 #               cache misses: md5, sha256, sha1, patch, changelog
39 # FIXME: comment the code to list assumptions and function purpose
40 # FIXME: add options to allow re-processing only specific packages
41 # FIXME: write something to clean up old files and patches
42 # FIXME: don't unpack or make a patch when we don't have all the parts
43 # FIXME: don't make a patch when we were not able to unpack the source package
44 # FIXME: cleanup files at start of run
45 # FIXME: extract new debian/patches/ patches
46 # FIXME: print out packages that are no longer in Debian
47 # FIXME: deal with really large patches:
48 # FIXME:   kde-l10n-*: too few parts to be useful
49 # FIXME:   divergence: too many changelog entries between versions to be useful
50 # FIXME:   derivative is older than Debian
51 # FIXME:   derivative renamed the source package
52 # FIXME:   just a really big diff
53 # FIXME: when there are multiple dsc files in snapshots, prefer the debian/debian-archive one
54 # FIXME: when the source package is ancient and the dsc is missing, make a fake one to use
55 # FIXME: add an in-memory cache of hashes so that hashes in multiple releases hit the disk once
56 # FIXME: deal with rate-limited websites like alioth that do not like many requests
57
58 import os
59 import sys
60 import httplib
61 import urllib2
62 import hashlib
63 import shutil
64 import logging
65 import tempfile
66 import string
67 import socket
68 import signal
69 import subprocess
70 import yaml
71 from debian import deb822, changelog
72 import apt_pkg
73 import psycopg2
74 try: import simplejson as json
75 except ImportError: import json
76
77 # Helper functions for python stuff with annoying error handling
78
def makedirs(dirs):
        """Create dirs and any missing parents; an existing tree is not an error."""
        try:
                os.makedirs(dirs)
        except OSError:
                pass
82
def rmtree(dir):
        """Recursively delete dir; a missing directory is silently ignored."""
        try:
                shutil.rmtree(dir)
        except OSError:
                pass
86
def remove(file):
        """Delete file if present; a missing file is silently ignored."""
        try:
                os.remove(file)
        except OSError:
                pass
90
def symlink(source, link):
        """Create a symbolic link to source; failures (such as the link
        already existing) are silently ignored."""
        try:
                os.symlink(source, link)
        except OSError:
                pass
94
95 # http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
        """preexec_fn for subprocess.Popen: restore SIGPIPE's default
        disposition, since Python installs its own handler which non-Python
        children do not expect."""
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
100
101 # We need to map apt_pkg.version_compare return values to cmp return values
102 # The documentation is incorrect: http://bugs.debian.org/680891
def apt_version_cmp(a, b):
        """Compare Debian versions a and b, normalising the result of
        apt_pkg.version_compare to the -1/0/1 cmp() convention."""
        ret = apt_pkg.version_compare(a, b)
        # Sign of ret, expressed arithmetically instead of by branching
        return (ret > 0) - (ret < 0)
108
109 # Config
# Hash-addressed caches of files downloaded from derivatives, one tree per
# checksum type advertised in their Sources files
md5_cache_dir = os.path.abspath('../md5-farm')
sha1_cache_dir = os.path.abspath('../sha1-farm')
sha256_cache_dir = os.path.abspath('../sha256-farm')
# Caches of generated patches, lsdiff output and changelog entries, keyed by sha1
sha1_patch_dir = os.path.abspath('../sha1-patches')
sha1_lsdiff_dir = os.path.abspath('../sha1-lsdiff')
sha1_changelog_dir = os.path.abspath('../sha1-changelog')
# Human-readable patch trees: per-derivative and shared across derivatives
deriv_patch_dir = os.path.abspath('patches')
global_patch_dir = os.path.abspath('../patches')
# Read-only sha1-addressed file farm maintained by snapshot.debian.org
snapshot_cache_dir = '/srv/snapshot.debian.org/farm'
# Placeholder document used instead of patches that are too large to publish
patch_too_large = os.path.abspath('../../doc/patch-too-large.txt')
# Parallel tuples: Sources checksum key, hashlib constructor name and the
# corresponding deb822 header -- the order of the three must stay in sync
checksum_types = ('sha1', 'sha256', 'md5sum')
checksum_hashlib = ('sha1', 'sha256', 'md5')
checksum_headers = ('Checksums-Sha1', 'Checksums-Sha256', 'Files')
user_agent = 'Debian Derivatives Census QA bot'
# Network timeout in seconds for urllib2.urlopen
timeout = 60
# True when s consists only of hexadecimal digits (empty string included)
ishex = lambda s: not(set(s)-set(string.hexdigits))
126
# Init
apt_pkg.init()

# Preparation
# Read the derivative's sources.list (pointed at by the APT configuration)
sources_list = apt_pkg.SourceList()
sources_list.read_main_list()
# Connection to the snapshot.debian.org metadata database via a pg service
conn = psycopg2.connect("service=snapshot-guest")
cur = conn.cursor()
# Start the log file afresh on each run
# NOTE(review): the usage comment above lists six arguments but the log file
# is taken from sys.argv[7] -- confirm the intended argument positions
remove(sys.argv[7])
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG, filename=sys.argv[7])

# Voodoo
lists_dir = apt_pkg.config.find_dir('Dir::State::lists')
# For each sources.list entry, keep only the source package indices
source_entries = [[i for i in x.index_files if i.label=='Debian Source Index'] for x in sources_list.list]
# The derivative is identified by the name of the current working directory
derivative_short_name = os.path.basename(os.getcwd())
# Statistics counters
modifies_dsc_files = 0
repackaged_but_identical = 0
144
145 # Helper functions for generating path names
146
def hash_path_parent(dir, hash):
        """Return the two-level fan-out directory for hash below dir."""
        return os.path.join(dir, hash[:2], hash[2:4])
149
def hash_path(dir, hash):
        """Return the full fan-out path of hash below dir."""
        return os.path.join(dir, hash[:2], hash[2:4], hash)
152
def hash_path_exists(dir, hash):
        """True when the fan-out path for hash exists below dir."""
        candidate = os.path.join(dir, hash[:2], hash[2:4], hash)
        return os.path.exists(candidate)
155
def sha1_patch_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path of the patch between a Debian dsc and a derivative dsc,
        optionally qualified by a patch type (e.g. 'debian')."""
        suffix = ('.%s' % type if type else '') + '.patch'
        base = os.path.join(hash_path(sha1_patch_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        return os.path.abspath(base + suffix)
161
def sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1, type=None):
        """Cache path of the lsdiff output for a Debian/derivative dsc pair,
        optionally qualified by a patch type."""
        suffix = ('.%s' % type if type else '') + '.lsdiff'
        base = os.path.join(hash_path(sha1_lsdiff_dir, debian_dsc_sha1), hash_path('', dsc_sha1))
        return os.path.abspath(base + suffix)
167
def shortslug(name):
        """Return the pool-style prefix for a source package name: the first
        four characters for lib* packages, otherwise the first character."""
        if name.startswith('lib'):
                return name[:4]
        return name[0]
170
def deriv_patch_path(name, version, debian_name, debian_version, type=None):
        """Per-derivative path of the patch between a Debian source package
        and the derivative's version of it."""
        parent = os.path.join(deriv_patch_dir, shortslug(debian_name), debian_name)
        filename = '_'.join((debian_name, debian_version, name, version))
        if type:
                filename += '.%s' % type
        filename += '.patch'
        return os.path.abspath(os.path.join(parent, filename))
177
def global_patch_path(name, version, debian_name, debian_version, type=None):
        """Cross-derivative path of the patch between a Debian source package
        and this derivative's version of it."""
        parent = os.path.join(global_patch_dir, shortslug(debian_name), debian_name)
        filename = '_'.join(('Debian', debian_name, debian_version, derivative_short_name, name, version))
        if type:
                filename += '.%s' % type
        filename += '.patch'
        return os.path.abspath(os.path.join(parent, filename))
184
185 # Functions for munging source packages
186
def convert_lzip_to_gzip(dir, name):
        """Recompress the lzip-compressed file name inside dir with gzip,
        since dpkg-source cannot unpack lzip parts.

        Returns (old_name, new_name) for later dsc bookkeeping.
        """
        def run(cmdline, message, target):
                # Run a recompression step, logging its combined output on failure
                process = subprocess.Popen(cmdline, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                output = process.communicate()[0]
                if process.returncode:
                        logging.warning(message, target)
                        logging.warning(output)
        run(['lzip', '-d', '--', name], 'lzip reported failure to decompress %s:', name)
        bname = name[0:-3]  # strip off the .lz extension
        # gzip -1: cheapest compression, we only need dpkg-source to accept it
        run(['gzip', '-1', '--', bname], 'gzip reported failure to compress %s:', bname)
        return (name, bname + '.gz')
202
def update_dsc_file(dir, dsc_name, parts):
        """Rewrite the checksum/size/name entries of the dsc in dir after some
        parts were recompressed.

        parts is a list of (old_name, new_name) pairs as returned by
        convert_lzip_to_gzip. The dsc is re-read, its matching file entries
        replaced with freshly computed hashes, and written back in place.
        """
        dsc_path = os.path.join(dir, dsc_name)
        with open(dsc_path, 'rb') as dsc_file:
                dsc = deb822.Dsc(dsc_file)
        for (old, name) in parts:
                path = os.path.join(dir, name)
                size = os.path.getsize(path)
                # Hash the replacement part once for every checksum type
                hashes = {}
                for (type, func) in zip(checksum_types, checksum_hashlib):
                        hashes[type] = getattr(hashlib, func)()
                with open(path, 'rb') as f:
                        for chunk in iter(lambda: f.read(128*64), b''):
                                for type in checksum_types:
                                        hashes[type].update(chunk)
                for type in checksum_types:
                        hashes[type] = hashes[type].hexdigest()
                # Swap the old entry for the new one in each checksum header
                for (header, type) in zip(checksum_headers, checksum_types):
                        if header in dsc:
                                dsc[header] = [{type: hashes[type], 'size': size, 'name': name} if p['name'] == old else p for p in dsc[header]]
        # Remove first so we don't change the original file the dsc links to
        os.remove(dsc_path)
        with open(dsc_path, 'wb') as dsc_file:
                dsc.dump(dsc_file)
227
228 # Functions for downloading files and storing them in the hash caches
229
def download_and_check_hash(url, dir, hash, hash_type):
        """Download url, verify it against hash (sha256 or md5sum) and store
        it in the sha1-addressed caches.

        Returns (True, sha1) when the file is in the snapshot cache,
        (False, sha1) when it is only in the derivatives cache, and
        ('unknown', None) when the download or verification failed.
        """
        try:
                parent = hash_path_parent(dir,hash)
                path = hash_path(dir,hash)
                logging.debug('downloading %s', url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                try:
                        data = u.read()
                finally:
                        u.close()
                if hash_type == 'sha256':
                        data_hash = hashlib.sha256(data).hexdigest()
                elif hash_type == 'md5sum':
                        data_hash = hashlib.md5(data).hexdigest()
                else:
                        logging.warning('unknown hash type detected: %s %s %s', hash_type, hash, url)
                        return ('unknown', None)
                if data_hash != hash:
                        logging.warning('incorrect hash for downloaded file, ignoring: %s %s != %s %s', hash_type, hash, data_hash, url)
                        return ('unknown', None)
                sha1 = hashlib.sha1(data).hexdigest()
                sha1_path = hash_path(sha1_cache_dir, sha1)
                sha1_parent = hash_path_parent(sha1_cache_dir, sha1)
                makedirs(sha1_parent)
                if hash_path_exists(snapshot_cache_dir, sha1):
                        snapshot_path = hash_path(snapshot_cache_dir, sha1)
                        symlink(snapshot_path, path)
                        logging.debug('exists in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (True, sha1)
                else:
                        if not os.path.exists(sha1_path):
                                logging.debug('correct hash for downloaded file, saving: %s %s %s %s', hash_type, hash, sha1, url)
                                # Binary mode: parts are compressed tarballs etc.
                                with open(sha1_path, 'wb') as f:
                                        f.write(data)
                        else:
                                logging.debug('correct hash for downloaded file, not saving: already in derivs cache: %s %s %s %s', hash_type, hash, sha1, url)
                        # Relative link so the cache trees can be relocated together
                        symlink(os.path.relpath(sha1_path, os.path.dirname(path)), path)
                        logging.debug('does not exist in snapshot sha1 cache: %s %s %s %s', hash_type, hash, sha1, url)
                        return (False, sha1)
        except urllib2.URLError as e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download hash file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException as e:
                logging.warning('unable to download hash file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error as e:
                logging.warning('unable to download hash file, ignoring: %s %s', e, url)
                return ('unknown', None)
282
def download_sha1(url, dir, sha1):
        """Download url, verify its sha1 and save it into the sha1-addressed
        cache dir.

        Returns (False, sha1) on success ('modified' relative to Debian,
        since snapshot does not have it) and ('unknown', None) on failure.
        """
        try:
                parent = hash_path_parent(dir,sha1)
                path = hash_path(dir,sha1)
                logging.debug('downloading sha1: %s %s', sha1, url)
                makedirs(parent)
                headers = { 'User-Agent' : user_agent }
                req = urllib2.Request(url, None, headers)
                u = urllib2.urlopen(req, None, timeout)
                try:
                        data = u.read()
                finally:
                        u.close()
                data_sha1 = hashlib.sha1(data).hexdigest()
                if data_sha1 == sha1:
                        logging.debug('correct sha1 for downloaded file, saving: %s %s', sha1, url)
                        if not os.path.exists(path):
                                # Binary mode: parts are compressed tarballs etc.
                                with open(path, 'wb') as f:
                                        f.write(data)
                        return (False, sha1)
                else:
                        logging.warning('incorrect sha1 for downloaded file, ignoring: %s != %s %s', sha1, data_sha1, url)
                        return ('unknown', None)
        except urllib2.URLError as e:
                if hasattr(e, 'reason'): reason = e.reason
                elif hasattr(e, 'code'): reason = e.code
                else: reason = e
                logging.warning('unable to download sha1 file, ignoring: %s %s', reason, url)
                return ('unknown', None)
        except httplib.HTTPException as e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', repr(e), url)
                return ('unknown', None)
        except socket.error as e:
                logging.warning('unable to download sha1 file, ignoring: %s %s', e, url)
                return ('unknown', None)
316
317 # Functions for checking the hash caches
318
def check_hash_cache(dir, hash, hash_type, url):
        """Resolve a derivative file by its advertised md5/sha256 hash.

        The hash cache entries are symlinks whose target name is the file's
        sha1. Returns the same (status, sha1) pairs as
        download_and_check_hash, which is used as a fallback.
        """
        logging.debug('checking hash cache: %s %s', hash_type, hash)
        link = hash_path(dir, hash)
        try:
                target = os.readlink(link)
        except OSError:
                logging.debug('does not exist in hash cache: %s %s', hash_type, hash)
                return download_and_check_hash(url, dir, hash, hash_type)
        logging.debug('exists in hash cache: %s %s', hash_type, hash)
        sha1 = os.path.basename(os.path.join(os.path.dirname(link), target))
        if hash_path_exists(snapshot_cache_dir, sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                # Drop the now-redundant derivatives cache copy
                remove(hash_path(sha1_cache_dir, sha1))
                return (True, sha1)
        if hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        logging.debug('missing in derivatives sha1 cache: %s', sha1)
        return download_and_check_hash(url, dir, hash, hash_type)
340
def check_sha1_cache(sha1, url):
        """Locate sha1 in the snapshot or derivatives caches, downloading it
        into the derivatives cache as a last resort.

        Returns (True, sha1) for snapshot hits, (False, sha1) for
        derivatives-cache hits, or download_sha1's result otherwise.
        """
        logging.debug('checking sha1 caches: %s', sha1)
        if hash_path_exists(snapshot_cache_dir, sha1):
                logging.debug('exists in snapshot sha1 cache: %s', sha1)
                # Drop the now-redundant derivatives cache copy
                remove(hash_path(sha1_cache_dir, sha1))
                return (True, sha1)
        if hash_path_exists(sha1_cache_dir, sha1):
                logging.debug('exists in derivatives sha1 cache: %s', sha1)
                return (False, sha1)
        logging.debug('does not exist in any sha1 caches: %s', sha1)
        return download_sha1(url, sha1_cache_dir, sha1)
353
def status(type, hash, url):
        """Classify one source package part by checksum.

        Returns ('unmodified', sha1) when the part exists in snapshot (i.e.
        was in Debian), ('modified', sha1) when it only exists in the
        derivatives cache, and ('unknown', None) on failure. The three hash
        types previously had triplicated branch logic; the mapping of cache
        result to status is identical for all of them.
        """
        logging.debug('checking status of hash: %s %s %s', type, hash, url)
        if type == 'sha1':
                (ret, sha1) = check_sha1_cache(hash, url)
        elif type == 'sha256':
                (ret, sha1) = check_hash_cache(sha256_cache_dir, hash, type, url)
        elif type == 'md5sum':
                (ret, sha1) = check_hash_cache(md5_cache_dir, hash, type, url)
        else:
                logging.warning('unknown hash type detected: %s %s %s', type, hash, url)
                return ('unknown', None)
        # True => in snapshot, False => only in derivs cache, else passthrough
        if ret == True:
                return ('unmodified', sha1)
        elif ret == False:
                return ('modified', sha1)
        else:
                return (ret, sha1)
383
384 # Functions for getting information about source packages
385
def get_info(srcpkg):
        """Extract the dsc entry and non-dsc parts from a derivative Sources
        stanza.

        Returns (dsc_hash_type, dsc_hash, dsc_name, parts) where parts is a
        list of per-file dicts, or None when no single usable dsc was found.
        """
        dsc = None
        for header in checksum_headers:
                if not dsc and header in srcpkg:
                        dsc = [x for x in srcpkg[header] if x['name'].endswith('.dsc')]
        if not dsc:
                logging.warning('did not find any dsc files')
                return None
        if len(dsc) > 1:
                # Each entry is a dict of name/size/hash keys: format by name
                # (the old tuple unpacking crashed on these three-key dicts)
                logging.warning('found multiple dsc files: %s' % ' '.join(['%s' % d['name'] for d in dsc]))
                return None
        dsc = dsc[0]
        dsc_name = dsc['name']
        # The one remaining key after dropping the bookkeeping keys is the hash
        dsc_hash_type, dsc_hash = [(k, v) for k, v in dsc.items() if k not in ('name', 'size')][0]

        parts = []
        part_names = []
        for header in checksum_headers:
                if header in srcpkg:
                        for part in srcpkg[header]:
                                if 'name' in part and part['name'] not in part_names and not part['name'].endswith('.dsc'):
                                        parts.append(part)
                                        part_names.append(part['name'])

        return (dsc_hash_type, dsc_hash, dsc_name, parts)
411
def get_debian_info(files):
        """Split a Debian snapshot file list into its dsc and remaining parts.

        files is a sequence of (name, sha1) pairs. Returns
        (dsc_sha1, dsc_name, parts) where parts keeps the input order and
        drops duplicate names, or None when no dsc file is present.
        """
        dscs = [entry for entry in files if entry[0].endswith('.dsc')]
        if not dscs:
                logging.warning('did not find any Debian dsc files: snapshots bug or ancient source package')
                return None
        if len(dscs) > 1:
                logging.warning('found multiple Debian dsc files, choosing first one: %s' % ' '.join(['%s %s' % (dsc_name, dsc_sha1) for dsc_name, dsc_sha1 in dscs]))

        dsc_name, dsc_sha1 = dscs[0]

        parts = []
        seen = []
        for entry in files:
                part_name = entry[0]
                if part_name.endswith('.dsc') or part_name in seen:
                        continue
                parts.append(entry)
                seen.append(part_name)

        return (dsc_sha1, dsc_name, parts)
432
433 # Functions for extracting information from the snapshots database
434
def database_error(e):
        """Log a psycopg2 failure and reset the shared connection so later
        queries can proceed."""
        code = getattr(e, 'pgcode', None)
        reason = getattr(e, 'pgerror', None)
        logging.warning('unable to execute database query: %s %s', code, reason)
        conn.reset()
442
def srcpkg_was_in_debian(name, version=None):
        """True if the named source package (optionally at an exact version)
        ever appeared in Debian according to snapshots; None on DB error."""
        try:
                if version:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s AND version=%s LIMIT 1;', (name, version))
                else:
                        cur.execute('SELECT version FROM srcpkg WHERE name=%s LIMIT 1;', (name,))
                # Duplicated fetch/convert collapsed into a single tail
                return cur.fetchone() is not None
        except psycopg2.Error as e:
                database_error(e)
                return None
454
def sha1_to_srcpkgs(sha1):
        """List the (name, version) source packages that ever shipped a file
        with this sha1; None on database error."""
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
466
def srcpkg_to_sha1s(name, version):
        """List the sha1 hashes of all files belonging to one source package
        version; None on database error."""
        try:
                cur.execute(
                        '''SELECT hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        WHERE name=%s AND version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
478
def srcpkg_to_srcpkgs(name):
        """List all (name, version) rows for a source package name, newest
        version first; None on database error."""
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s ORDER BY version DESC;''', (name,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
489
def sha1s_to_files(sha1):
        """List the distinct (name, hash) file rows matching a sha1; None on
        database error."""
        try:
                # Bug fix: the parameter was previously passed as the builtin
                # `hash` instead of the function argument wrapped in a tuple
                cur.execute('SELECT DISTINCT ON (name, hash) name, hash FROM file WHERE hash=%s;', (sha1,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
497
def srcpkg_to_files(name, version):
        """List the distinct (name, hash) file rows of one source package
        version; None on database error."""
        try:
                cur.execute(
                        '''SELECT DISTINCT ON (file.name, file.hash) file.name, file.hash
                        FROM file_srcpkg_mapping
                        JOIN srcpkg ON srcpkg.srcpkg_id=file_srcpkg_mapping.srcpkg_id
                        JOIN file ON file_srcpkg_mapping.hash=file.hash
                        WHERE srcpkg.name=%s AND srcpkg.version=%s;''', (name, version))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
510
def sha1_version_to_derived_from(sha1, version):
        """Find the source package containing sha1 that a derivative version
        is most likely derived from: the newest version not greater than the
        derivative's, otherwise the oldest match.

        Returns a list with at most one (name, version) row, or None on
        database error.
        """
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s and version<=%s
                        ORDER BY name ASC, version DESC
                        LIMIT 1;''', (sha1, version))
                res = cur.fetchall()
                if res: return res
                # Fallback: oldest version containing the file. Bug fix: this
                # query has one placeholder but was given two parameters.
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        JOIN file_srcpkg_mapping ON file_srcpkg_mapping.srcpkg_id=srcpkg.srcpkg_id
                        WHERE hash=%s
                        ORDER BY name ASC, version ASC
                        LIMIT 1;''', (sha1,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
533
def srcpkg_to_derived_from(name, version):
        """Find the Debian version of a source package that a derivative
        version is most likely derived from: the newest version not greater
        than the derivative's, otherwise the oldest known version.

        Returns a list with at most one (name, version) row, or None on
        database error.
        """
        try:
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s and version<=%s
                        ORDER BY version DESC
                        LIMIT 1;''', (name, version))
                res = cur.fetchall()
                if res: return res
                cur.execute(
                        '''SELECT name, version
                        FROM srcpkg
                        WHERE name=%s
                        ORDER BY version ASC
                        LIMIT 1;''', (name,))
                return cur.fetchall()
        except psycopg2.Error as e:
                database_error(e)
                return None
554
555 # Functions related to creating patches
556
557 # Add symlinks for all needed files
def prepare(dsc_name, dsc_sha1, parts):
        """Create a temporary directory with symlinks to every part of a
        derivative source package, recompressing lzip parts with gzip.

        Returns the temporary directory path (caller removes it).
        """
        logging.debug('preparing deriv directory for %s', dsc_name)
        tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-%s-' % derivative_short_name)
        # Prefer the snapshot farm copy, fall back to the derivatives cache.
        # Bug fix: an unconditional reassignment after this check previously
        # made it dead code, which could produce a dangling symlink when the
        # derivatives cache copy had been removed in favour of snapshot's.
        path = hash_path(snapshot_cache_dir, dsc_sha1)
        if not os.path.exists(path): path = hash_path(sha1_cache_dir, dsc_sha1)
        dsc_path = os.path.join(tmp_dir, dsc_name)
        os.symlink(path, dsc_path)
        converted_parts = []
        for part in parts:
                if 'sha1' in part:
                        path = hash_path(snapshot_cache_dir, part['sha1'])
                        if not os.path.exists(path): path = hash_path(sha1_cache_dir, part['sha1'])
                elif 'sha256' in part: path = hash_path(sha256_cache_dir, part['sha256'])
                elif 'md5sum' in part: path = hash_path(md5_cache_dir, part['md5sum'])
                part_path = os.path.join(tmp_dir, part['name'])
                os.symlink(path, part_path)
                # Some distributions allow additional compression schemes
                # Here we work around this by recompressing with gzip
                if part['name'].endswith('.lz'):
                        converted_parts.append(convert_lzip_to_gzip(tmp_dir, part['name']))
        # Update the dsc file if we recompressed any files
        if converted_parts:
                update_dsc_file(tmp_dir, dsc_name, converted_parts)
        return tmp_dir
583
def prepare_debian(dsc_name, dsc_sha1, files):
        """Create a temporary directory with symlinks to the Debian dsc and
        its parts from the snapshot farm.

        Returns the directory path, or None (after cleanup) when any part is
        unreadable, since a debdiff would then be impossible.
        """
        logging.debug('preparing Debian directory for %s', dsc_name)
        debian_tmp_dir = tempfile.mkdtemp(prefix='derivs-cmp-srcpkg-Debian-')
        readable = 0
        source = hash_path(snapshot_cache_dir, dsc_sha1)
        if os.access(source, os.R_OK):
                readable += 1
        os.symlink(source, os.path.join(debian_tmp_dir, dsc_name))
        for (part_name, part_sha1) in files:
                source = hash_path(snapshot_cache_dir, part_sha1)
                os.symlink(source, os.path.join(debian_tmp_dir, part_name))
                if os.access(source, os.R_OK):
                        readable += 1
        # Expect the dsc plus every listed part to be readable
        if readable != (1 + len(files)):
                logging.info('only %s parts of %s are readable', readable, dsc_name)
                rmtree(debian_tmp_dir)
                return None
        return debian_tmp_dir
603
def get_changelog_entries(tmp_dir, dsc_name, dsc_sha1):
        """Return the [(package, version), ...] changelog entries of a source
        package, unpacking it with dpkg-source if needed.

        Results are cached as JSON keyed by the dsc sha1. Returns None when
        the package cannot be unpacked or the changelog cannot be parsed.
        """
        logging.debug('getting changelog entries from %s', dsc_name)

        # Cache check
        changelog_path = hash_path(sha1_changelog_dir, dsc_sha1)
        if os.path.exists(changelog_path):
                logging.debug('changelog cache exists for %s %s', dsc_name, dsc_sha1)
                with open(changelog_path) as f:
                        try:
                                changelog_entries = json.load(f)
                        except ValueError:
                                pass  # corrupt cache entry: fall through and regenerate
                        else:
                                return [tuple(entry) for entry in changelog_entries]

        # Preparation
        extract_path = os.path.join(tmp_dir,'extracted')

        # Unpack the source tree
        logging.debug('unpacking source package %s', dsc_name)
        cmdline = ['dpkg-source', '-x', dsc_name, 'extracted']
        process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
        output = process.communicate()[0]
        if process.returncode:
                logging.warning('dpkg-source reported failure to extract %s:', dsc_name)
                logging.warning(output)
                # Log a directory listing to help debug the failure
                cmdline = ['ls', '-lR', '--time-style=+']
                process = subprocess.Popen(cmdline, cwd=tmp_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
                output = process.communicate()[0]
                logging.warning(output)
                rmtree(extract_path)
                return None

        # Sanitise the debian dir and changelog file in case it is a symlink to outside
        debian_dir = os.path.join(extract_path, 'debian')
        changelog_filename = os.path.join(debian_dir,'changelog')
        if os.path.islink(debian_dir) or os.path.islink(changelog_filename):
                logging.warning('debian dir or changelog is a symbolic link %s', dsc_name)
                rmtree(extract_path)
                return None

        # Check if the changelog exists
        if not os.path.exists(changelog_filename):
                logging.warning('could not find changelog in %s', dsc_name)
                rmtree(extract_path)
                return None

        # Find out which source package is the most likely derivative
        logging.debug('parsing changelog for %s', dsc_name)
        changelog_file = open(changelog_filename)
        try:
                try:
                        changelog_obj = changelog.Changelog(changelog_file)
                except UnicodeDecodeError:
                        # Not valid UTF-8: retry assuming latin-1
                        changelog_file.seek(0)
                        changelog_obj = changelog.Changelog(changelog_file, encoding='iso-8859-1')
                try:
                        changelog_entries = [(entry.package, str(entry._raw_version)) for entry in changelog_obj]
                except Exception:
                        # python-debian can raise a variety of parse errors here;
                        # previously a bare except also swallowed KeyboardInterrupt
                        logging.warning('could not read changelog from %s', dsc_name)
                        rmtree(extract_path)
                        return None
                del changelog_obj
        finally:
                changelog_file.close()

        # Clean up again
        rmtree(extract_path)

        # Write the cache
        makedirs(hash_path_parent(sha1_changelog_dir, dsc_sha1))
        remove(changelog_path)
        with open(changelog_path, 'w') as f:
                json.dump(changelog_entries, f)

        return changelog_entries
678
# Find the source package name and version this is probably derived from
def find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified):
	"""Guess which Debian source package this derivative package is based on.

	tmp_dir: directory the derivative's dsc was prepared/extracted in
	name, version: source package name and version in the derivative
	dsc_name, dsc_sha1: dsc file name and its sha1
	parts_unmodified: list of (sha1, filename) non-dsc parts also in Debian

	Returns a (name, version) tuple, or None when no plausible base
	package could be found.
	"""
	logging.debug('finding base source package of %s %s', name, version)

	# Get a list of changelog entries
	changelog_entries = get_changelog_entries(tmp_dir, dsc_name, dsc_sha1)
	if changelog_entries:
		logging.debug('changelog entries are: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in changelog_entries]))

	# Get a list of candidate versions from the database:
	# every source package that ever shipped one of the unmodified parts
	possibly_derived_from = []
	logging.debug('checking which parts were in Debian')
	for part_sha1, part_name in parts_unmodified:
		part_derived_from = sha1_to_srcpkgs(part_sha1)
		if part_derived_from:
			logging.debug('part %s %s available in %s', part_sha1, part_name, ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in part_derived_from]))
			possibly_derived_from.extend(part_derived_from)

	# No parts shared with Debian: fall back to every version of the
	# same source package name that was ever in Debian
	if not possibly_derived_from:
		logging.debug('no parts in common with Debian, obtaining old versions')
		old_packages = srcpkg_to_srcpkgs(name)
		if old_packages: possibly_derived_from = old_packages

	# Uniqify
	possibly_derived_from = list(set(possibly_derived_from))
	if possibly_derived_from:
		logging.debug('possibly derived from: %s', ' '.join(['%s %s' % (entry_name, entry_version) for entry_name, entry_version in possibly_derived_from]))
	else:
		logging.debug('nothing in possibly derived from list')

	# Match changelog versions against candidates
	if changelog_entries:
		logging.debug('matching changelog entries against versions possibly derived from')
		for entry in changelog_entries:
			entry_name, entry_version = entry
			if entry in possibly_derived_from:
				logging.debug('%s %s in possibly derived from', entry_name, entry_version)
				return entry
		# No candidate matched: accept any changelog entry that was
		# ever a source package version in Debian
		logging.debug('checking if changelog entries were ever in Debian')
		for entry_name, entry_version in changelog_entries:
			if srcpkg_was_in_debian(entry_name, entry_version):
				logging.debug('%s %s was in Debian', entry_name, entry_version)
				return (entry_name, entry_version)
	# Changelog did not help: pick the first same-named candidate whose
	# version is equal or lower than ours (list is sorted descending),
	# otherwise the last (lowest-sorted) candidate
	if possibly_derived_from:
		logging.debug('finding closest entry in possibly derived from')
		possibly_derived_from.sort(cmp=lambda a,b: apt_version_cmp(b[1],a[1]))
		for entry_name, entry_version in possibly_derived_from:
			if name == entry_name and apt_version_cmp(version, entry_version) >= 0:
				logging.debug('%s %s is an equal or lower version', entry_name, entry_version)
				return (entry_name, entry_version)
		entry = possibly_derived_from[-1]
		entry_name, entry_version = entry
		logging.debug('no lower version numbers, returning next highest version %s %s', entry_name, entry_version)
		return entry
	# Last resort: ask the database for the closest version in Debian
	logging.debug('finding closest version number in Debian')
	for entry in srcpkg_to_derived_from(name, version):
		entry_name, entry_version = entry
		logging.debug('closest package was %s %s', entry_name, entry_version)
		return entry
	logging.debug('could not find Debian package %s %s is derived from', name, version)
	return None
740
# Generate a patch file
def create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1):
	"""Create and cache the debdiff patches between a Debian source
	package and the corresponding derivative source package.

	Writes two files into the sha1-addressed patch cache: the full
	debdiff and a filterdiff-reduced patch covering only the debian/
	directory, each preceded by a diffstat. Patches larger than 100MB
	are replaced by a symlink to the 'patch too large' placeholder.

	Returns True when the patches are in place, False when debdiff
	failed or found the two packages identical.
	"""
	global repackaged_but_identical

	def log_dir_listings():
		# Dump recursive directory listings of both extracted trees to
		# the log to help diagnose debdiff failures. Previously this
		# code was duplicated in both failure branches below.
		cmdline = ['ls', '-lR', '--time-style=+']
		for listing_name, listing_dir in (derivative_short_name, tmp_dir), ('Debian', debian_tmp_dir):
			logging.warning('dir listing for %s:', listing_name)
			process = subprocess.Popen(cmdline, cwd=listing_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
			logging.warning(process.communicate()[0])

	dsc_path = os.path.join(tmp_dir, dsc_name)
	debian_dsc_path = os.path.join(debian_tmp_dir, debian_dsc_name)
	path_everything = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	path_debian = sha1_patch_path(debian_dsc_sha1, dsc_sha1, 'debian')

	# Generate the main patch, unless it is already cached
	if not os.path.exists(path_everything) and os.path.exists(debian_dsc_path) and os.path.exists(dsc_path):
		makedirs(os.path.dirname(path_everything))
		cmdline = ['debdiff', '--quiet', '--diffstat', debian_dsc_path, dsc_path]
		stdout = open(path_everything, 'w')
		try:
			process = subprocess.Popen(cmdline, stdout=stdout, stderr=subprocess.PIPE, preexec_fn=subprocess_setup)
			output = process.communicate()[1]
		finally:
			# Close the cache file even if communicate() raises
			stdout.close()
		if process.returncode == 255:
			# debdiff exits 255 on hard failure
			logging.warning('debdiff reported failure %s %s:', debian_dsc_name, dsc_name)
			logging.warning(output)
			log_dir_listings()
			return False
		elif process.returncode == 0:
			# debdiff exits 0 when it found no differences at all
			logging.info('derivative repackaged in an identical way %s %s', debian_dsc_name, dsc_name)
			repackaged_but_identical += 1
			return False
		elif process.returncode != 1:
			# 1 means differences were found; anything else is unexpected
			logging.warning('debdiff reported unknown return code %s %s %s:', process.returncode, debian_dsc_name, dsc_name)
			logging.warning(output)
			log_dir_listings()
			return False

	# Filter the main patch to include only the debian/ directory
	if os.path.exists(path_everything) and not os.path.exists(path_debian):
		makedirs(os.path.dirname(path_debian))
		cmdline = ['filterdiff', '--include=*/debian/*', path_everything]
		filterdiff = subprocess.Popen(cmdline, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		filterdiff_output = filterdiff.communicate()[0]
		diffstat = subprocess.Popen('diffstat', stdin=subprocess.PIPE, stdout=subprocess.PIPE, preexec_fn=subprocess_setup)
		diffstat_output = diffstat.communicate(filterdiff_output)[0]
		f = open(path_debian, 'w')
		f.write('diffstat of debian/ for %s %s\n' % (os.path.splitext(debian_dsc_name)[0], os.path.splitext(dsc_name)[0]))
		f.write('\n')
		f.write(diffstat_output)
		f.write('\n')
		f.write(filterdiff_output)
		f.close()

	# Patches > 100MB are probably not that useful, replace them with a link
	for path in path_everything, path_debian:
		try:
			if os.path.getsize(path) > 104857600:
				logging.info('patch between %s and %s is larger than 100MB', dsc_name, debian_dsc_name)
				remove(path)
				symlink(os.path.relpath(patch_too_large, os.path.dirname(path)), path)
		except OSError:
			# The patch file does not exist: nothing to replace
			pass

	return True
810
def check_patch(debian_dsc_sha1, dsc_sha1):
	"""Return True when the cached patch touches any file other than a
	debian/changelog, i.e. when it is worth presenting to users.

	The lsdiff output is cached on disk, keyed by the two dsc sha1s.
	"""
	patch_path = sha1_patch_path(debian_dsc_sha1, dsc_sha1)
	lsdiff_path = sha1_lsdiff_path(debian_dsc_sha1, dsc_sha1)
	if not os.path.exists(lsdiff_path):
		# Cache miss: run lsdiff over the patch and store the file list
		logging.debug('lsdiff cache does not exist for %s', patch_path)
		process = subprocess.Popen(['lsdiff', patch_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=subprocess_setup)
		listing = process.communicate()[0]
		makedirs(os.path.dirname(lsdiff_path))
		cache = open(lsdiff_path,'w')
		cache.write(listing)
		cache.close()
	else:
		logging.debug('lsdiff cache exists for %s', patch_path)
		cache = open(lsdiff_path)
		listing = cache.read()
		cache.close()
	# A patch that only modifies changelogs is not interesting
	for touched in listing.splitlines():
		if touched == 'debian/changelog' or touched.endswith('/debian/changelog'):
			continue
		logging.debug('patch changes files other than debian/changelog')
		return True
	logging.debug('patch does not change files other than debian/changelog')
	return False
835
def present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1):
	"""Publish the cached patches by symlinking them into the
	per-derivative and global patch directories.

	Returns a tuple of patch paths (relative to the global patch dir)
	when the patch is useful, otherwise an empty tuple; the symlinks are
	created either way.
	"""
	worth_listing = check_patch(debian_dsc_sha1, dsc_sha1)
	published = []
	# '' is the full patch, 'debian' the debian/-only variant
	for patch_type in ('', 'debian'):
		target = sha1_patch_path(debian_dsc_sha1, dsc_sha1, patch_type)
		if not os.path.exists(target):
			continue
		deriv_link = deriv_patch_path(name, version, debian_name, debian_version, patch_type)
		global_link = global_patch_path(name, version, debian_name, debian_version, patch_type)
		# Replace any stale links with fresh relative symlinks
		for link in (deriv_link, global_link):
			makedirs(os.path.dirname(link))
			remove(link)
			symlink(os.path.relpath(target, os.path.dirname(link)), link)
		if worth_listing:
			published.append(os.path.relpath(global_link, os.path.abspath(global_patch_dir)))
	return tuple(published)
855
856 # Functions that wrap other functions and decide what to do
857
def check_source_package(source_entry, srcpkg):
	"""Check one derivative source package against Debian.

	source_entry: apt source entry the package list came from (used to
	build URLs via archive_uri)
	srcpkg: a deb822 Sources paragraph for the package

	Returns None when the package is skipped (unmodified, or its
	metadata looks malformed/malicious), otherwise a tuple
	(files, patch, link, new) where each element may be None:
	files - list of (sha1, hash_type, hash) for the dsc and modified parts
	patch - patch metadata when a patch against Debian was generated
	link  - (debian_name, debian_version, name, version, dsc_url)
	new   - (name, version, dsc_url) when no base package was found
	"""
	global modifies_dsc_files

	try:
		# Pre-set so the KeyError handler below can log whatever
		# fields were successfully read
		name = None
		version = None
		dir = None
		name = srcpkg['Package']
		version = srcpkg['Version']
		dir = srcpkg['Directory']
		# Reject values that could escape the expected directory layout
		if '/' in name or name == '..':
			logging.warning('could not process source package %s %s: possibly malicious name', name, version)
			return None
		if '/' in version or version == '..':
			logging.warning('could not process source package %s %s: possibly malicious version', name, version)
			return None
		if '..' in dir.split('/'):
			logging.warning('could not process source package %s %s: possibly malicious dir: %s', name, version, dir)
			return None
	except KeyError:
		logging.warning('could not process source package %s %s', name, version)
		return None
	logging.debug('started processing source package %s %s', name, version)
	info = get_info(srcpkg)
	if not info:
		logging.warning('finished processing source package %s %s: could not get any info', name, version)
		return None
	dsc_hash_type, dsc_hash, dsc_name, parts = info
	if '/' in dsc_name or dsc_name == '..':
		logging.warning('could not process source package %s %s: possibly malicious dsc name %s', name, version, dsc_name)
		return None
	if not ishex(dsc_hash):
		logging.warning('could not process source package %s %s: possibly malicious dsc hash %s', name, version, dsc_hash)
		return None
	dsc_url = source_entry.archive_uri('%s/%s' % (dir, dsc_name))
	logging.debug('found dsc file: %s %s %s', dsc_hash_type, dsc_hash, dsc_url)
	dsc_status, dsc_sha1 = status(dsc_hash_type, dsc_hash, dsc_url)
	logging.debug('checked dsc status: %s %s %s', dsc_status, dsc_sha1, dsc_url)
	if dsc_status == 'unmodified':
		# Ignore the srcpkg since we know it is was in Debian
		# at one point and is hopefully therefore unmodified
		logging.debug('finished processing source package %s %s: dsc unmodified', name, version)
		return None
	else:
		files = [(dsc_sha1, dsc_hash_type, dsc_hash)]
		parts_unmodified = []
		parts_modified = []
		parts_unknown = []
		# Classify each non-dsc part as unmodified/modified/unknown
		# relative to the snapshot database
		for part in parts:
			part_name = part['name']
			if '/' in part_name or part_name == '..':
				logging.warning('could not process source package %s %s: possibly malicious part name %s', name, version, part_name)
				return None
			part_url = source_entry.archive_uri('%s/%s' % (dir, part_name))
			# The part dict holds name, size and exactly one hash field;
			# pick out that hash field
			part_hash_type, part_hash = [(k, v) for k, v in part.iteritems() if k not in ('name', 'size')][0]
			if not ishex(part_hash):
				logging.warning('could not process source package %s %s: possibly malicious part hash %s', name, version, part_hash)
				return None
			logging.debug('found part file: %s %s %s', part_hash_type, part_hash, part_url)
			part_status, part_sha1 = status(part_hash_type, part_hash, part_url)
			logging.debug('checked part status: %s %s %s', part_status, part_sha1, part_url)
			# Remember the sha1 on the part for later use by prepare()
			if 'sha1' not in part and part_sha1: part['sha1'] = part_sha1
			if part_status == 'unmodified': parts_unmodified.append((part_sha1, part_name))
			elif part_status == 'modified': parts_modified.append((part_sha1, part_name))
			else: parts_unknown.append((part_sha1, part_name))
			if part_status == 'modified': files.append((part_sha1, part_hash_type, part_hash))

		# Compare before deduplication so a package with every part
		# unmodified is detected correctly
		all_parts_unmodified = (len(parts_unmodified) == len(parts))
		parts_unmodified = list(set(parts_unmodified))
		logging.debug('source package status %s %s: dsc %s, %s parts unmodified, %s parts modified, %s parts unknown', name, version, dsc_status, len(parts_unmodified), len(parts_modified), len(parts_unknown))

		if all_parts_unmodified:
			# Ignore the srcpkg since we know all the parts were
			# in Debian at one point and ergo, it is unmodified
			logging.debug('finished processing source package %s %s: all non-dsc parts unmodified', name, version)
			if dsc_status == 'modified':
				logging.info('source package %s %s: unmodified, but dsc different', name, version)
				modifies_dsc_files += 1
			return (files, None, None, None)
		else:
			logging.debug('some parts modified, looking for derived version %s %s', name, version)
			if not dsc_sha1:
				logging.warning('finished processing source package %s %s: sha1 missing for dsc file', name, version)
				return (files, None, None, None)
			if parts_unknown:
				logging.warning('finished processing source package %s %s: sha1 missing for some parts', name, version)
				return (files, None, None, None)
			new = None
			link = None
			patch = None
			# Extract the package and look for the Debian package it
			# is most likely based on
			tmp_dir = prepare(dsc_name, dsc_sha1, parts)
			derived_from = find_derived_from(tmp_dir, name, version, dsc_name, dsc_sha1, parts_unmodified)
			if derived_from:
				debian_name, debian_version = derived_from
				link = (debian_name, debian_version, name, version, dsc_url)
				logging.debug('source package %s %s derived from %s %s', name, version, debian_name, debian_version)
				debian_files = srcpkg_to_files(debian_name, debian_version)
				if debian_files:
					debian_info = get_debian_info(debian_files)
					if debian_info:
						debian_dsc_sha1, debian_dsc_name, debian_parts = debian_info
						logging.debug('Debian source package %s %s dsc found %s %s', debian_name, debian_version, debian_dsc_name, debian_dsc_sha1)
						# Extract the Debian package and diff the two
						debian_tmp_dir = prepare_debian(debian_dsc_name, debian_dsc_sha1, debian_parts)
						if debian_tmp_dir:
							patch_created = create_patch(tmp_dir, dsc_name, dsc_sha1, debian_tmp_dir, debian_dsc_name, debian_dsc_sha1)
							if patch_created:
								patch_names = present_patch(name, version, dsc_sha1, debian_name, debian_version, debian_dsc_sha1)
								if patch_names:
									patch = (debian_name, debian_version, debian_dsc_sha1, name, version, dsc_sha1, [part[0] for part in parts_modified], patch_names)
								else:
									logging.debug('patch between %s %s and %s %s is probably not useful', debian_name, debian_version, name, version)
							rmtree(debian_tmp_dir)
						else:
							# This could be an issue with snapshots or a file that is not distributable
							logging.info('source package %s %s: could not create temporary dir for Debian: %s %s', name, version, debian_name, debian_version)
					else:
						logging.warning('source package %s %s: could not get Debian info for %s %s: %s', name, version, debian_name, debian_version, debian_info)
				else:
					if srcpkg_was_in_debian(debian_name, debian_version):
						logging.warning('source package %s %s: snapshot database issue, no Debian files found', debian_name, debian_version)
					else:
						logging.warning('source package %s %s: derived from %s %s possibly bogus', name, version, debian_name, debian_version)
			else:
				# Nothing in Debian matches: report it as a new package
				new = (name, version, dsc_url)
			rmtree(tmp_dir)
			logging.debug('finished processing source package %s %s: all done', name, version)
			return (files, patch, link, new)
985
def process_sources(source_entries, lists_dir):
	"""Check every source package in every Sources index referenced by
	the derivative's sources.list.

	source_entries: iterable of apt sources (each itself iterable of
	source entries)
	lists_dir: directory where the downloaded Sources files live

	Returns (files, patches, links, new): the aggregated non-None
	results from check_source_package for every package seen.
	"""
	files = []
	patches = []
	links = []
	new = []
	for source in source_entries:
		for source_entry in source:
			# The describe string ends in '(<filename>)'; extract the
			# filename to locate the downloaded Sources index
			fn = os.path.join(lists_dir, source_entry.describe.rstrip(')').rpartition('(')[2])
			try: f = file(fn)
			except IOError: continue
			for srcpkg in deb822.Sources.iter_paragraphs(f):
				actions = check_source_package(source_entry, srcpkg)
				if actions:
					action_files, action_patch, action_link, action_new = actions
					if action_files:
						files.append(action_files)
						logging.debug('action: return files %s', ' '.join([' '.join([str(item) for item in action]) for action in action_files]))
					if action_patch:
						patches.append(action_patch)
						# str() each element: action_patch mixes strings
						# with a list of sha1s and a tuple of patch names.
						# The previous ' '.join(action) space-joined the
						# characters of each string, garbling the log.
						logging.debug('action: return patches %s', ' '.join([str(item) for item in action_patch]))
					if action_link:
						links.append(action_link)
						logging.debug('action: return links to modified source packages %s', ' '.join(action_link))
					if action_new:
						new.append(action_new)
						logging.debug('action: return links to new source packages %s', ' '.join(action_new))
				logging.debug('done')
				logging.debug('')
			f.close()
	return (files, patches, links, new)
1016
logging.debug('processing distribution %s', derivative_short_name)

# Check every source package in the derivative's package lists
files, patches, links, new = process_sources(source_entries, lists_dir)

# Done with the database, close the connection
cur.close()
conn.close()
1024
# Write out the results

# sys.argv[3]: YAML map of sha1 -> {hash_type: hash} for the dsc and all
# modified parts, relating alternate hashes (md5/sha256) to sha1s.
# NOTE(review): the usage comment at the top of the file describes
# argv[3] as the patches list; here it receives this hash map — confirm
# which is current. Unlike the other outputs, a stale file is not
# removed when there is no data.
filename = sys.argv[3]
data = files
if data:
	output_data = {}
	for package in data:
		for modified_file in package:
			sha1, hash_type, hash = modified_file
			if sha1 not in output_data:
				output_data[sha1] = {}
			# Record each non-sha1 hash once; warn when the same sha1
			# was seen with a conflicting hash value
			if hash_type != 'sha1' and hash_type not in output_data[sha1]:
				output_data[sha1][hash_type] = hash
			elif hash_type != 'sha1' and hash != output_data[sha1][hash_type]:
				logging.warning('hashes mismatched: %s: %s %s != %s', sha1, hash_type, hash, output_data[sha1][hash_type])
	output = file(os.path.abspath(filename), 'wb')
	yaml.safe_dump(output_data, output)
	output.close()
1042
# sys.argv[4]: YAML list of patch metadata records, one per generated
# patch. Also make sure both patch directories carry their HEADER.html
# and .htaccess symlinks.
filename = sys.argv[4]
data = patches
if data:
	if not os.path.exists(os.path.join(global_patch_dir,'HEADER.html')):
		symlink('../../doc/HEADER.patches.html',os.path.join(global_patch_dir,'HEADER.html'))
	if not os.path.exists(os.path.join(global_patch_dir,'.htaccess')):
		symlink('../../etc/htaccess.patches',os.path.join(global_patch_dir,'.htaccess'))
	if not os.path.exists(os.path.join(deriv_patch_dir,'HEADER.html')):
		symlink('../../../doc/HEADER.patches.html',os.path.join(deriv_patch_dir,'HEADER.html'))
	# Fix: this previously tested global_patch_dir (just created above),
	# so the per-derivative .htaccess symlink was never created
	if not os.path.exists(os.path.join(deriv_patch_dir,'.htaccess')):
		symlink('../../../etc/htaccess.patches',os.path.join(deriv_patch_dir,'.htaccess'))
	output_data = []
	for item in data:
		debian_name, debian_version, debian_sha1, name, version, sha1, parts_sha1, patches = item
		item = {}
		item['debian_name'] = debian_name
		item['debian_version'] = debian_version
		item['debian_sha1'] = debian_sha1
		item['name'] = name
		item['version'] = version
		item['sha1'] = sha1
		item['patches'] = patches
		item['parts'] = parts_sha1
		output_data.append(item)
	output = file(os.path.abspath(filename), 'wb')
	yaml.safe_dump(output_data, output)
	output.close()
else:
	# No patches: drop any stale output from a previous run
	remove(filename)
1072
# sys.argv[5]: YAML map debian_name -> debian_version -> list of
# {name, version, dsc} entries linking modified derivative packages to
# the Debian packages they appear to be based on
filename = sys.argv[5]
data = links
if data:
	data = list(set(data))
	# Sort by Debian name/version, then derivative name/version
	data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]) or apt_version_cmp(a[3],b[3]))
	output_data = {}
	output = file(os.path.abspath(filename), 'wb')
	for debian_name, debian_version, name, version, dsc_url in data:
		if debian_name not in output_data:
			output_data[debian_name] = {}
		if debian_version not in output_data[debian_name]:
			output_data[debian_name][debian_version] = []
		item = {}
		item['name'] = name
		item['version'] = version
		item['dsc'] = dsc_url
		output_data[debian_name][debian_version].append(item)
	yaml.safe_dump(output_data, output)
	output.close()
else:
	# No links: drop any stale output from a previous run
	remove(filename)
1094
# sys.argv[6]: YAML map name -> version -> list of dsc URLs for source
# packages that could not be matched to anything in Debian
filename = sys.argv[6]
data = new
if data:
	data = list(set(data))
	data.sort(cmp=lambda a,b: cmp(a[0],b[0]) or apt_version_cmp(a[1],b[1]) or cmp(a[2],b[2]))
	output_data = {}
	output = file(os.path.abspath(filename), 'wb')
	for name, version, dsc_url in data:
		if name not in output_data:
			output_data[name] = {}
		if version not in output_data[name]:
			output_data[name][version] = []
		# Coerce to a plain string — presumably so yaml.safe_dump can
		# represent the URL object; verify against archive_uri's type
		output_data[name][version].append(str(dsc_url))
	yaml.safe_dump(output_data, output)
	output.close()
else:
	# No new packages: drop any stale output from a previous run
	remove(filename)

logging.shutdown()