#!/usr/bin/env python
"""
This script imports information from ftp new queue into the database
See http://ftp-master.debian.org/new.822 and
http://ftp-master.debian.org/new.html
"""
from debian_bundle import deb822
from os import access, mkdir, unlink, W_OK
from sys import stderr
import aux
from aux import quote
from gatherer import gatherer
import email.Utils
import re
from time import ctime
from psycopg2 import IntegrityError
def get_gatherer(connection, config, source):
    """Return a new ftpnew_gatherer built from the given arguments.

    The three arguments are handed to the ftpnew_gatherer constructor
    unchanged.
    """
    new_queue_gatherer = ftpnew_gatherer(connection, config, source)
    return new_queue_gatherer
# Any non-zero value makes the gatherer report on stderr which packages it
# skips because they are already known in UDD.
DEBUG=0
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
    """Return *string* with span/pre markup removed and basic entities decoded.

    Opening and closing ``<span ...>`` and ``<pre>`` tags are dropped, then
    the entities ``&quot;``, ``&lt;``, ``&gt;`` and ``&amp;`` are replaced by
    the characters they stand for.

    Tags are stripped before entities are decoded so that escaped text such
    as ``&lt;pre&gt;`` survives as literal text, and ``&amp;`` is decoded
    last so that ``&amp;lt;`` yields ``&lt;`` instead of being decoded twice.
    """
    string = re.sub(r"</?span[^>]*>", '', string)
    string = re.sub(r"</?pre>", '', string)
    string = string.replace("&quot;", '"')
    string = string.replace("&lt;", '<')
    string = string.replace("&gt;", '>')
    string = string.replace("&amp;", '&')
    return string
# These fields are not forwarded to UDD tables for the moment; the parser
# only copies them into the per-source .822 file.  Fields whose name starts
# with 'Npp-' are treated the same way.
fields_to_pass = (
    'Format',
    'Date',
    'Changed-By',
    'Files',
    'Uploaders',
    'Standards-Version',
    'Priority',
    'Urgency',
    'Dm-Upload-Allowed',
    'Autobuild',
    'Build-Depends',
    'Build-Depends-Indep',
    'Build-Conflicts',
    'Python-Version',
)

# Relationship fields that are stored for a binary package.
dependencies_to_accept = (
    'Depends',
    'Recommends',
    'Suggests',
    'Enhances',
    'Pre-Depends',
    'Breaks',
    'Replaces',
    'Provides',
    'Conflicts',
)
class src_pkg(object):
    """Collects the fields known about one source package of the NEW queue.

    All field values live in the dictionary ``self.s``, keyed by field name.
    ``has_several_versions`` starts at 0 and is meant to be set by the caller.
    """

    def __init__(self, source):
        """Create the record for source package *source* with preset defaults."""
        self.s = {}
        self.s['Source'] = source
        self.has_several_versions = 0
        # The empty tuple marks "not set yet"; once known, both fields hold strings
        self.s['Bin'] = ()           # list of binaries created from the source
        self.s['Architecture'] = ()  # architecture(s separated by blanks)
        # Just define Vcs fields in case it is not provided in the control
        self.s['Vcs-Type'] = None
        self.s['Vcs-Url'] = None
        # preset WNPP bug
        self.s['Closes'] = 0

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.s_non_mandatory:
            # 'in' replaces dict.has_key(), which no longer exists in Python 3
            if field not in self.s:
                self.s[field] = ''

    def __str__(self):
        """Return a one-line summary of the source package.

        Raises KeyError unless 'Version', 'Last_modified', 'Queue',
        'Distribution', 'maintainer_name' and 'maintainer_email' have been
        stored in ``self.s``; 'Closes' must be a number.
        """
        summary = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % \
            (self.s)
        summary += " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % (self.s)
        return summary
class bin_pkg(object):
    """Collects the fields known about one binary package of the NEW queue.

    All field values live in the dictionary ``self.b``, keyed by field name.
    """

    def __init__(self, package, source):
        """Create the record for binary *package* built from *source*."""
        self.b = {}
        self.b['Package'] = package
        self.b['Source'] = source
        self.b['Installed-Size'] = 0
        self.b['License'] = ''

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.b_non_mandatory:
            # 'in' replaces dict.has_key(), which no longer exists in Python 3
            if field not in self.b:
                self.b[field] = ''

    def __str__(self):
        """Return a one-line summary of the binary package.

        Raises KeyError unless 'Version', 'Architecture', 'Maintainer',
        'Description' and 'Long_Description' have been stored in ``self.b``.
        """
        return "Package %s: %s, %s, %s, %s, %s" % \
            (self.b['Package'], self.b['Version'], self.b['Architecture'], self.b['Maintainer'],
             self.b['Description'], self.b['Long_Description'])
class ftpnew_gatherer(gatherer):
    """This class imports the data from New queue into the database

    run() reads ``new.822`` and one ``<source>_<version>.html`` page per
    queued upload from the configured path, writes a ``.822`` rendering of
    each page next to it and fills the configured source and package tables.

    NOTE(review): the block nesting of this class was reconstructed from the
    statement order; the spots where more than one nesting was plausible are
    marked with NOTE(review) comments below.
    """

    # Field name tables; only the keys carry information.  s_non_mandatory and
    # b_non_mandatory are read by src_pkg.check_dict() / bin_pkg.check_dict();
    # the other tables are not referenced anywhere else in this file.
    s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
                   'Queue': 0, 'Last_modified': 0}
    s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
                       'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
                       'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
                       'Vcs-Mtn':0, 'Vcs-Browser': 0, 'License': 0, 'Section': 0
                       }
    s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg':0, 'X-Vcs-Git':0,
                   'Directory':0, 'Comment':0, 'Origin':0, 'Url':0, 'X-Collab-Maint':0, 'Autobuild':0, 'Vcs-Cvs:':0,
                   'Python-Standards-Version':0, 'url':0, 'originalmaintainer':0, 'Originalmaintainer':0,
                   'Build-Recommends':0,
                   'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
                   'Priority': 0, 'Python-Version': 0, 'Checksums-Sha1':0,
                   'Checksums-Sha256':0, 'Original-Maintainer':0, 'Dm-Upload-Allowed':0,
                   'Standards-Version': 0,
                   }
    b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
                       'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks':0, 'Installed-Size': 0,
                       'Homepage': 0, 'Size': 0, 'Build-Essential':0, 'Origin':0,
                       'SHA1':0, 'Replaces':0, 'Section':0, 'MD5sum':0, 'Bugs':0, 'Priority':0,
                       'Tag':0, 'Task':0, 'Python-Version':0, 'Provides':0, 'Conflicts':0,
                       'SHA256':0, 'Original-Maintainer':0}

    s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
    s_vcs = { 'Arch':0, 'Bzr':0, 'Cvs':0, 'Darcs':0, 'Git':0, 'Hg':0, 'Svn':0, 'Mtn':0}

    # NOTE(review): the HTML markup inside the next four patterns had been
    # lost, leaving unterminated string literals.  They are reconstructed
    # here as "<p>...</p>" for the web server's 404 text and as a two-cell
    # table row (<tr><td>key:</td><td>value</td></tr>) for the fields, with
    # "[^>]*" tolerating whatever attributes the cells carry -- TODO confirm
    # against a real <source>_<version>.html page.
    src_html_failed_re = re.compile(r"^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
    src_html_has_tag_re = re.compile(r'^\s*<tr><td[^>]*>([-\w]+):</td><td[^>]*>(.+)</td></tr>$')
    src_html_has_description_start_re = re.compile(r'^\s*<tr><td[^>]*>Description:</td><td[^>]*>(.+)')
    src_html_has_description_end_re = re.compile(r'(.+)</td></tr>')

    closes_is_itp_re = re.compile(r'^\s*(ITP|RFP|ITA)')
    vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')

    def __init__(self, connection, config, source):
        """Initialise the base gatherer and check the required config keys."""
        gatherer.__init__(self, connection, config, source)
        self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')

    def check_existing_binaries(self, values, queue):
        """Return 1 if any binary package name in *values* is known in UDD, else 0.

        *values* is an iterable of binary package names; *queue* is only used
        in the DEBUG message.  Relies on the prepared statement
        ftpnew_check_existing_package, which run() creates.
        """
        # Sometimes the source package name has changed, but the binary package name is known in UDD
        # we are not interested in these packages
        cur = self.cursor()
        for value in values:
            # The name comes from a parsed file, so let the driver quote it
            # instead of pasting it into the statement text.
            cur.execute("EXECUTE ftpnew_check_existing_package (%s)", (value,))
            in_udd = cur.fetchone()[0]
            if in_udd:
                if DEBUG != 0:
                    print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
                        % (value, int(in_udd), queue)
                return 1
        return 0

    def run(self):
        """Rebuild the source and package tables from the downloaded queue data.

        Empties both configured tables, then handles every stanza of
        ``new.822`` whose queue is neither 'accepted' nor 'proposedupdates':
        sources already in UDD are skipped, the matching HTML page is parsed
        line by line into one src_pkg and a list of bin_pkg records (and
        mirrored into a ``.822`` file), and the records are inserted unless
        one of the binaries is already known in UDD.
        """
        my_config = self.my_config
        #start harassing the DB, preparing the final inserts and making place
        #for the new data:
        cur = self.cursor()

        # if we check whether a package just exists in UDD we ignore oldstable which is currently etch but other
        # dists might have to be ignored as well
        cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)" \
                    % self.my_config["releases_ignore"])
        # For some reason the code tries to add binary packages twice - just verify whether the package is
        # just included to make sure we do not trigger conflicting primary keys
        # (prepared here but not executed anywhere in this file)
        cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")

        cur.execute("DELETE FROM %s" % my_config["table_sources"])
        cur.execute("DELETE FROM %s" % my_config["table_packages"])

        query = """PREPARE ftpnew_insert_source
                   AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
                                      changed_by, architecture, homepage,
                                      vcs_type, vcs_url, vcs_browser,
                                      section, distribution, component, closes, license, last_modified, queue)
                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)""" % (my_config['table_sources'])
        cur.execute(query)
        query = """PREPARE ftpnew_insert_package
                   AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
                                      depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
                                      installed_size, homepage, section, long_description, distribution, component, license)
                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)""" \
            % (my_config['table_packages'])
        cur.execute(query)

        ftpnew_data = open(my_config['path']+'/new.822')

        has_warned_about_missing_section_key = 0
        for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
            if stanza['queue'] == 'accepted' or stanza['queue'] == 'proposedupdates' :
                continue
            srcpkg = src_pkg(stanza['source'])
            versions = stanza['version'].split(' ')  # the page lists more than one version
            srcpkg.has_several_versions = len(versions)-1  # some tests below fail if more than one version in in queue
            srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
            srcpkg.s['Architecture'] = stanza['architectures']
            srcpkg.s['Queue'] = stanza['queue']
            srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified']))  # We want a real time object instead of an epoch
            srcpkg.s['Distribution'] = stanza['distribution']
            srcpkg.s['Changed-By'] = stanza['changed-by']
            try:
                srcpkg.s['Section'] = stanza['section']
                if stanza['section'].startswith('non-free'):
                    srcpkg.s['Component'] = 'non-free'
                elif stanza['section'].startswith('contrib'):
                    srcpkg.s['Component'] = 'contrib'
                else:
                    srcpkg.s['Component'] = 'main'
            except KeyError:
                srcpkg.s['Section'] = ''
                srcpkg.s['Component'] = ''
                if has_warned_about_missing_section_key == 0:
                    has_warned_about_missing_section_key = 1
                    print >>stderr, "Warning: Because of a bug in DAK code the Section field is currently missing."

            # Check UDD for existing source packages of this name
            # (parameterized: the name is read from new.822)
            cur.execute("SELECT count(*) FROM sources WHERE source = %s", (srcpkg.s['Source'],))
            in_udd = cur.fetchone()[0]
            if in_udd:
                if DEBUG != 0:
                    print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
                        % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Queue'])
                continue

            src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
            src_info_html = my_config['path'] + '/' + src_info_base + '.html'
            src_info_822 = my_config['path'] + '/' + src_info_base + '.822'
            try:
                srci = open(src_info_html, 'r')
            except IOError as err:
                print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
                continue
            srco = open(src_info_822, 'w')

            src_info_not_found = 0  # set when the page is the web server's 404 text
            in_description = 0      # inside a multi line Description value
            in_source = 1           # still before the first 'Package' field
            binpkgs = []            # finished binary package records
            binpkg = None           # binary package record currently being filled
            for line in srci:
                if ftpnew_gatherer.src_html_failed_re.match(line):
                    print >>stderr, "File %s not found." % (src_info_html)
                    src_info_not_found = 1
                    break
                match = ftpnew_gatherer.src_html_has_tag_re.match(line)
                if match:
                    field = match.groups()[0]
                    value = de_html(match.groups()[1])
                    if field == 'Package':
                        # Here begins a new binary package
                        if self.check_existing_binaries((value,), srcpkg.s['Queue']):
                            srcpkg.s['Queue'] = 'ignore'
                            break
                        if in_source:
                            in_source = 0
                        if binpkg:
                            binpkgs.append(binpkg)
                        binpkg = bin_pkg(value, srcpkg.s['Source'])
                        print >>srco, "\nPackage: %s" % (value)
                        binpkg.b['Distribution'] = srcpkg.s['Distribution']
                    elif field == 'Maintainer':
                        if in_source:
                            srcpkg.s[field] = value
                            srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Description':
                        # Only reached for a Description that fits on one line
                        if in_source:
                            srcpkg.s[field] = de_html(value)
                        else:
                            binpkg.b[field] = de_html(value)
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Architecture':
                        if in_source:
                            srcpkg.s[field] = value
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Source':
                        if in_source:
                            if value != srcpkg.s['Source']:
                                print >>stderr, "Incompatible source names between new.822(%s) and %s.html (%s)" % \
                                    (srcpkg.s['Source'], src_info_base, value)
                            srcpkg.s['Source'] = value
                        # NOTE(review): written for binary stanzas as well, like
                        # the Maintainer/Version branches -- nesting reconstructed
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Version':
                        if in_source:
                            if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
                                print >>stderr, "Incompatible version numbers between new.822(%s) and %s.html (%s)" % \
                                    (srcpkg.s[field], src_info_base, value)
                            # NOTE(review): stored unconditionally, as the Binary
                            # branch has to do -- nesting reconstructed
                            srcpkg.s[field] = value
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Closes':
                        values = value.split(' ')
                        found_itp = 0
                        for val in values:
                            try:
                                ival = int(val)
                            except ValueError:
                                # e.g. an empty item after splitting on blanks
                                print >>stderr, "Ignoring non-numeric bug number '%s' in Closes field of %s" % (val, srcpkg.s['Source'])
                                continue
                            query = "SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival)
                            cur.execute(query)
                            try:
                                wnpp_title = cur.fetchone()[0]
                            except TypeError:
                                # fetchone() returned None: not a wnpp bug
                                query = "SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival)
                                cur.execute(query)
                                bug_info = cur.fetchone()
                                if not bug_info:
                                    print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
                                else:
                                    print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
                                # There is no wnpp title to inspect for this bug;
                                # going on would use an unset or stale wnpp_title.
                                continue
                            if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
                                print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
                            else:
                                if found_itp:
                                    print >>stderr, "Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD" % \
                                        (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes'])
                                    query = "SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival)
                                    cur.execute(query)
                                    is_merged = cur.fetchone()[0]
                                    if is_merged != 2:
                                        print >>stderr, "  --> Bugs should be merged in BTS!"
                                else:  # stay with the ITP found first
                                    srcpkg.s['Closes'] = int(ival)
                                    found_itp = 1
                        if not found_itp:
                            print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])
                        print >>srco, "%s: %s\n" % (field, value)
                    elif field == 'Distribution':
                        if in_source:
                            if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
                                print >>stderr, "Incompatible distributions between new.822(%s) and %s.html (%s)" % \
                                    (srcpkg.s['Distribution'], src_info_base, value)
                            srcpkg.s['Distribution'] = value
                            print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Binary should not mention distribution field in %s.html (%s)" % \
                                (src_info_base, value)
                    elif field == 'Binary':
                        if in_source:
                            # Binaries are mentioned in different syntax in *.changes and *.dsc
                            value = re.sub(", +", " ", value)
                            # NOTE(review): check kept inside "if in_source" -- nesting reconstructed
                            if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
                                srcpkg.s['Queue'] = 'ignore'
                                break
                        if in_source:
                            if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
                                print >>stderr, "Incompatible binaries between new.822(%s) and %s.html (%s)" % \
                                    (srcpkg.s['Bin'], src_info_base, value)
                            srcpkg.s['Bin'] = value
                            print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Binary should not mention Binary field in %s.html (%s)" % \
                                (src_info_base, value)
                    elif field == 'Installed-Size':
                        if not in_source:
                            binpkg.b[field] = int(value)
                    elif field == 'Homepage':
                        if not in_source:
                            binpkg.b[field] = value
                    elif field == 'Section':
                        if not in_source:
                            if not binpkg:
                                print >>stderr, "This should not happen", srcpkg, field, value
                                # same effect as exit(-1) without relying on the site module
                                raise SystemExit(-1)
                            else:
                                binpkg.b[field] = value
                                binpkg.b['Component'] = srcpkg.s['Component']
                    elif field == 'Vcs-Browser':
                        srcpkg.s[field] = value
                    elif binpkg is not None and field in dependencies_to_accept:
                        binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field in fields_to_pass or field.startswith('Npp-'):
                        print >>srco, "%s: %s" % (field, value)
                    else:
                        matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
                        if matchvcs:
                            srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
                            srcpkg.s['Vcs-Url'] = value
                            print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
                            print >>srco, "*%s: %s" % (field, value)
                    continue
                if in_description:
                    match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
                    if match:
                        # last line of a multi line description
                        if match.groups()[0][0] != ' ':
                            description += ' '
                        description += de_html(match.groups()[0])
                        in_description = 0
                        if not in_source:  # binpkg and binpkg.b:
                            # first line is the synopsis, the rest the long description
                            (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
                            print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
                    else:
                        if line[0] != ' ':
                            description += ' '
                        description += de_html(line)
                else:
                    match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
                    if match:
                        in_description = 1
                        description = de_html(match.groups()[0]) + "\n"
            srci.close()
            srco.close()

            if src_info_not_found:
                # The page held no fields at all, so mandatory values such as
                # 'Maintainer' are missing and the insert below would raise KeyError.
                continue

            # A binary package record is only queued when the next 'Package'
            # field shows up, so the last one of the page is still pending here.
            if binpkg is not None and srcpkg.s['Queue'] != 'ignore':
                binpkgs.append(binpkg)

            if srcpkg.s['Queue'] != 'ignore':
                srcpkg.check_dict()
                query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
                           %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
                           %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
                           %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
                           %(Section)s, %(Distribution)s, %(Component)s, %(Closes)s, %(License)s,
                           %(Last_modified)s, %(Queue)s)"""
                cur.execute(query, srcpkg.s)
                for binpkg in binpkgs:
                    binpkg.check_dict()
                    query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
                               %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
                               %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
                               %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
                               %(Installed-Size)s, %(Homepage)s, %(Section)s,
                               %(Long_Description)s, %(Distribution)s, %(Component)s, %(License)s)"""
                    try:
                        cur.execute(query, binpkg.b)
                    except IntegrityError as err:
                        print >>stderr, err, src_info_html
                        print >>stderr, binpkg
                        print >>stderr, binpkg.b

        ftpnew_data.close()

        cur.execute("DEALLOCATE ftpnew_insert_source")
        cur.execute("DEALLOCATE ftpnew_insert_package")
        cur.execute("DEALLOCATE ftpnew_check_existing_package")
        # was prepared above, so release it as well
        cur.execute("DEALLOCATE ftpnew_check_just_added_package")
# NOTE(review): no main() is defined or imported in this file, so executing
# the module directly looks like it would fail with a NameError here --
# TODO confirm the intended entry point (the module exposes get_gatherer()).
if __name__ == '__main__':
main()
# vim:set et tabstop=2: