| 1 |
# /usr/bin/env python
|
| 2 |
# Last-Modified: <Sun Aug 17 12:24:40 2008>
|
| 3 |
# This file is a part of the Ultimate Debian Database project
|
| 4 |
|
| 5 |
import debian_bundle.deb822
|
| 6 |
import gzip
|
| 7 |
import os
|
| 8 |
import sys
|
| 9 |
import aux
|
| 10 |
import tempfile
|
| 11 |
from aux import ConfigException
|
| 12 |
import psycopg2
|
| 13 |
from gatherer import gatherer
|
| 14 |
import email.Utils
|
| 15 |
import re
|
| 16 |
|
| 17 |
def get_gatherer(connection, config, source):
    """Build the gatherer object for this module.

    The three arguments are handed unchanged to the packages_gatherer
    constructor, and the new instance is returned.
    """
    instance = packages_gatherer(connection, config, source)
    return instance
|
| 19 |
|
| 20 |
class packages_gatherer(gatherer):
    """This class imports the data from Packages.gz files into the database."""

    # For efficiency, these are dictionaries (only the keys matter).
    # mandatory: fields which each package has to provide
    # non_mandatory: fields which are possibly provided by packages
    # ignorable: fields which are not useful for the database,
    #            but for which no warning should be printed
    mandatory = {
        'Package': 0, 'Version': 0, 'Architecture': 0, 'Maintainer': 0,
        'Description': 0,
    }
    non_mandatory = {
        'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
        'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks': 0,
        'Installed-Size': 0, 'Homepage': 0, 'Size': 0, 'Build-Essential': 0,
        'Origin': 0, 'SHA1': 0, 'Replaces': 0, 'Section': 0, 'MD5sum': 0,
        'Bugs': 0, 'Priority': 0, 'Tag': 0, 'Task': 0, 'Python-Version': 0,
        'Provides': 0, 'Conflicts': 0, 'SHA256': 0, 'Original-Maintainer': 0,
    }
    ignorable = {
        'Filename': 0, 'Npp-Filename': 0, 'Npp-Name': 0, 'Npp-Mimetype': 0,
        'Npp-Applications': 0, 'Python-Runtime': 0, 'Npp-File': 0,
        'Npp-Description': 0, 'Url': 0, 'Gstreamer-Elements': 0,
        'Gstreamer-Version': 0, 'Gstreamer-Decoders': 0,
        'Gstreamer-Uri-Sinks': 0, 'Gstreamer-Encoders': 0,
        'Gstreamer-Uri-Sources': 0, 'url': 0, 'Vdr-PatchLevel': 0,
        'Vdr-Patchlevel': 0, 'originalmaintainer': 0, 'Originalmaintainer': 0,
        'Build-Recommends': 0, 'Multi-Arch': 0, 'Maintainer-Homepage': 0,
        'Tads2-Version': 0, 'Tads3-Version': 0,
    }
    # Field-name prefixes that are silently ignored as well (the misspelled
    # variants are deliberate: they are matched literally).
    ignorable_re = re.compile("^(Orig-|Original-|Origianl-|Orginal-|Orignal-|Orgiinal-|Debian-|X-Original-|Upstream-)")

    def __init__(self, connection, config, source):
        """Initialise the base gatherer and check the required config keys."""
        gatherer.__init__(self, connection, config, source)
        # The ID for the distribution we want to include
        self._distr = None
        self.assert_my_config('directory', 'archs', 'release', 'components', 'distribution', 'packages-table', 'packages-schema')
        # Maps each unknown field name to the number of times it was seen
        self.warned_about = {}
        # A mapping from <package-name><version> to 1. If
        # <package-name><version> is included in this dictionary, we have
        # already added this package with this version for architecture 'all'
        # to the database. Needed because different architectures include
        # packages for architecture 'all' with the same version, and we don't
        # want these duplicate entries.
        self.imported_all_pkgs = {}

    def build_dict(self, control):
        """Build a plain dictionary from the control paragraph.

        Influenced by the class variables mandatory, non_mandatory and
        ignorable.  Every mandatory and non-mandatory field becomes a key of
        the result; non-mandatory fields missing from control map to None.
        Fields that are neither known nor ignorable are counted in
        self.warned_about.

        Raises ValueError if a mandatory field is missing from control.
        """
        cls = packages_gatherer
        d = {}
        for k in cls.mandatory:
            if k not in control:
                # Was a string exception, which Python >= 2.6 rejects with a
                # TypeError that hides the actual message.
                raise ValueError("Mandatory field %s not specified" % k)
            d[k] = control[k]
        for k in cls.non_mandatory:
            if k in control:
                d[k] = control[k]
            else:
                d[k] = None
        for k in control.keys():
            if k in cls.non_mandatory or k in cls.mandatory or k in cls.ignorable:
                continue
            if cls.ignorable_re.match(k):
                continue
            self.warned_about[k] = self.warned_about.get(k, 0) + 1
        return d

    def import_packages(self, sequence, cur):
        """Import the packages from sequence using the database cursor cur.

        sequence is passed to debian_bundle.deb822.Packages.iter_paragraphs,
        so it is expected to be in the format of a Debian Packages file.
        The prepared statement 'package_insert' must already exist on the
        connection behind cur (run() creates it).

        psycopg2.ProgrammingError raised by the insert is re-raised after
        the query text has been printed.
        """
        pkgs = []
        query = """EXECUTE package_insert
      (%(Package)s, %(Version)s, %(Architecture)s, %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
      %(Description)s, %(Long_Description)s, %(Source)s, %(Source_Version)s, %(Essential)s,
      %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
      %(Pre-Depends)s, %(Breaks)s, %(Installed-Size)s, %(Homepage)s, %(Size)s,
      %(Build-Essential)s, %(Origin)s, %(SHA1)s,
      %(Replaces)s, %(Section)s, %(MD5sum)s, %(Bugs)s, %(Priority)s,
      %(Tag)s, %(Task)s, %(Python-Version)s, %(Provides)s,
      %(Conflicts)s, %(SHA256)s, %(Original-Maintainer)s)"""
        for control in debian_bundle.deb822.Packages.iter_paragraphs(sequence):
            # Check whether packages with architecture 'all' have already
            # been imported
            if control['Architecture'] == 'all':
                t = control['Package'] + control['Version']
                if t in self.imported_all_pkgs:
                    continue
                self.imported_all_pkgs[t] = 1

            d = self.build_dict(control)

            # The first line of the description is the short description,
            # everything after the first newline is the long description
            if 'Description' in d:
                parts = d['Description'].split("\n", 1)
                d['Description'] = parts[0]
                if len(parts) > 1:
                    d['Long_Description'] = parts[1]
                else:
                    d['Long_Description'] = ''

            # Convert numbers to numbers
            for f in ['Installed-Size', 'Size']:
                if d[f] is not None:
                    d[f] = int(d[f])

            # Source is non-mandatory, but we don't want it to be NULL.
            # The field is either "<name>" or "<name> (<version>)".
            source_fields = (d['Source'] or '').strip("'").split()
            if not source_fields:
                # Missing (or empty) Source: the binary package is built from
                # a source package of the same name and version.
                d['Source'] = d['Package']
                d['Source_Version'] = d['Version']
            elif len(source_fields) == 1:
                d['Source_Version'] = d['Version']
            else:
                d['Source'] = source_fields[0]
                d['Source_Version'] = source_fields[1].strip("()")

            # Split "Name <address>" into its two parts; must happen for
            # every package, as the insert statement needs both keys
            d['maintainer_name'], d['maintainer_email'] = email.Utils.parseaddr(d['Maintainer'])

            pkgs.append(d)

        try:
            cur.executemany(query, pkgs)
        except psycopg2.ProgrammingError:
            print(query)
            raise

    def setup(self):
        """Evaluate the SQL schema file configured for this source.

        Raises Exception if 'schema-dir' is missing from the general
        configuration or 'packages-schema' from this source's configuration.
        """
        if 'schema-dir' not in self.config['general']:
            raise Exception("'schema-dir' not specified")
        if 'packages-schema' not in self.my_config:
            raise Exception("'packages-schema' not specified for source " + self.source)
        schema_dir = self.config['general']['schema-dir']
        schema = schema_dir + '/' + self.my_config['packages-schema']
        self.eval_sql_file(schema, self.my_config)

    def tables(self):
        """Return the names of the packages table and its summary table."""
        # NOTE(review): run() also fills <packages-table>_distrelcomparch,
        # which is not listed here -- confirm whether that is intentional.
        return [
            self.my_config['packages-table'],
            self.my_config['packages-table'] + '_summary']

    def _import_packages_file(self, path, cur):
        """Decompress the Packages.gz at path and import its content.

        The content is copied to a temporary file first, so that
        debian_bundle gets a real file to read from (needed for it to use
        apt_pkg).  All file handles are closed, whether or not the import
        succeeds.
        """
        tmp = tempfile.NamedTemporaryFile()
        try:
            compressed = gzip.open(path)
            try:
                # Copy in chunks instead of holding the whole file in memory
                while True:
                    chunk = compressed.read(1024 * 1024)
                    if not chunk:
                        break
                    tmp.write(chunk)
            finally:
                compressed.close()
            # The file is re-opened by name below, so everything written so
            # far has to leave our buffer first
            tmp.flush()
            plain = open(tmp.name)
            try:
                self.import_packages(plain, cur)
            finally:
                plain.close()
        finally:
            tmp.close()

    def run(self):
        """Replace this source's rows in the packages table and rebuild the
        derived tables.

        For every configured component the existing rows of this
        distribution/release/component are deleted; then the Packages.gz of
        every configured architecture is imported.  A Packages.gz that cannot
        be read is reported on stdout and skipped.  Finally the _summary and
        _distrelcomparch tables are rebuilt from the packages table, the
        tables are ANALYZEd and the collected warnings are printed.
        """
        src_cfg = self.my_config

        aux.debug = self.config['general']['debug']
        table = src_cfg['packages-table']

        # Get distribution ID
        self._distr = src_cfg['distribution']

        cur = self.cursor()
        # defer constraints checking until the end of the transaction
        cur.execute("SET CONSTRAINTS ALL DEFERRED")

        # For every part and every architecture, import the packages into
        # the DB.  Table, distribution, release and component names are
        # interpolated into the SQL text; they come from self.my_config.
        for comp in src_cfg['components']:
            cur.execute("DELETE FROM %s WHERE distribution = '%s' AND release = '%s' AND component = '%s'" %
                        (table, self._distr, src_cfg['release'], comp))
            for arch in src_cfg['archs']:
                path = os.path.join(src_cfg['directory'], comp, 'binary-' + arch, 'Packages.gz')
                try:
                    cur.execute("""PREPARE package_insert AS INSERT INTO %s
            (Package, Version, Architecture, Maintainer, maintainer_name, maintainer_email, Description, Long_Description, Source,
            Source_Version, Essential, Depends, Recommends, Suggests, Enhances,
            Pre_Depends, Breaks, Installed_Size, Homepage, Size,
            build_essential, origin, sha1, replaces, section,
            md5sum, bugs, priority, tag, task, python_version,
            provides, conflicts, sha256, original_maintainer,
            Distribution, Release, Component)
          VALUES
            ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15,
              $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28,
              $29, $30, $31, $32, $33, $34, $35, '%s', '%s', '%s')
            """ % (table, self._distr, src_cfg['release'], comp))
                    self._import_packages_file(path, cur)
                except IOError as e:
                    # Not every IOError carries (errno, strerror) -- e.g. the
                    # gzip module raises one with a single message argument.
                    message = getattr(e, 'strerror', None) or e
                    print("Could not read packages from %s: %s" % (path, message))
                # The statement is prepared again for the next file, so it
                # has to go even if this import was skipped
                cur.execute("DEALLOCATE package_insert")

        # Fill the summary tables
        summary_table = table + '_summary'
        cur.execute("DELETE FROM %s" % summary_table)
        cur.execute("""INSERT INTO %s (package, version, source, source_version,
        maintainer, maintainer_name, maintainer_email, distribution, release, component)
      SELECT DISTINCT ON (package, version, distribution, release, component)
        package, version, source, source_version, maintainer, maintainer_name, maintainer_email, distribution, release, component
      FROM %s""" % (summary_table, table))

        arch_table = table + '_distrelcomparch'
        cur.execute("DELETE FROM %s" % arch_table)
        cur.execute("""INSERT INTO %s
        (distribution, release, component, architecture)
      SELECT DISTINCT distribution, release, component, architecture
      FROM %s""" % (arch_table, table))

        cur.execute("ANALYZE %s" % table)
        cur.execute("ANALYZE %s" % summary_table)
        cur.execute("ANALYZE %s" % arch_table)

        self.print_warnings()

    def print_warnings(self):
        """Print one line for every unknown field name seen during import."""
        for key in self.warned_about:
            print("[Packages] Unknown key %s appeared %d times" % (key, self.warned_about[key]))
|