/[collab-qa]/udd/udd/ftpnew_gatherer.py
ViewVC logotype

Contents of /udd/udd/ftpnew_gatherer.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1473 - (hide annotations) (download) (as text)
Tue Jun 9 21:50:03 2009 UTC (3 years, 11 months ago) by tille
File MIME type: text/x-python
File size: 20428 byte(s)
Distribution field is in packages table so also feature it in new_packages
1 tille 1406 #!/usr/bin/env python
2    
3     """
4     This script imports information from ftp new queue into the database
5     See http://ftp-master.debian.org/new.822 and
6     http://ftp-master.debian.org/new.html
7     """
8    
9     from debian_bundle import deb822
10     from os import access, mkdir, unlink, W_OK
11     from sys import stderr
12     import aux
13     from aux import quote
14     from gatherer import gatherer
15     import email.Utils
16     import re
17     from time import ctime
18     from psycopg2 import IntegrityError
19    
def get_gatherer(connection, config, source):
    """Factory hook: build the gatherer object implemented by this module.

    The three arguments are handed unchanged to the ``ftpnew_gatherer``
    constructor and the freshly created instance is returned.
    """
    new_queue_gatherer = ftpnew_gatherer(connection, config, source)
    return new_queue_gatherer
22    
# Set to a non-zero value to get additional diagnostics on stderr.
DEBUG = 0


# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
    """Return *string* with the HTML decoration of the NEW pages removed.

    ``<span ...>``, ``</span>``, ``<pre>`` and ``</pre>`` tags are dropped and
    the entities ``&quot;``, ``&lt;``, ``&gt;`` and ``&amp;`` are decoded.

    The markup is stripped *before* the entities are decoded so that an
    escaped literal such as ``&lt;pre&gt;`` survives as the text ``<pre>``,
    and ``&amp;`` is decoded *last* so that ``&amp;lt;`` becomes ``&lt;``
    rather than being unescaped twice into ``<``.
    """
    # Real markup first: anything still looking like a tag afterwards is text.
    string = re.sub(r"</?span[^>]*>", '', string)
    string = re.sub(r"</?pre>", '', string)
    # Plain literal replacements; no regular expression needed.
    string = string.replace("&quot;", '"')
    string = string.replace("&lt;", '<')
    string = string.replace("&gt;", '>')
    # Must stay last, otherwise freshly produced '&' could form new entities.
    string = string.replace("&amp;", '&')
    return string
34    
# These fields are not forwarded to UDD tables for the moment
# (the same applies to every field whose name starts with 'Npp-').
fields_to_pass = (
    'Format',
    'Date',
    'Changed-By',
    'Files',
    'Uploaders',
    'Standards-Version',
    'Priority',
    'Urgency',
    'Dm-Upload-Allowed',
    'Autobuild',
    'Build-Depends',
    'Build-Depends-Indep',
    'Build-Conflicts',
    'Python-Version',
)

# Relationship fields of a binary package that are accepted.
dependencies_to_accept = (
    'Depends',
    'Recommends',
    'Suggests',
    'Enhances',
    'Pre-Depends',
    'Breaks',
    'Replaces',
    'Provides',
    'Conflicts',
)
54    
class src_pkg():
    """Record of one source package found in the NEW queue.

    All collected fields live in the dictionary ``self.s``, keyed by the
    field name.  ``has_several_versions`` starts at 0 and is maintained by
    the code that fills the record.
    """

    def __init__(self, source):
        """Create the record for the source package named *source*."""
        self.s = {}
        self.s['Source'] = source
        self.has_several_versions = 0
        # Binaries created from the source; () means "nothing recorded yet"
        self.s['Bin'] = ()
        # Architecture(s); () means "nothing recorded yet"
        self.s['Architecture'] = ()
        # Just define Vcs fields in case it is not provided in the control
        self.s['Vcs-Type'] = None
        self.s['Vcs-Url'] = None
        # preset WNPP bug
        self.s['Closes'] = 0

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.s_non_mandatory:
            # setdefault() leaves already recorded values untouched
            self.s.setdefault(field, '')

    def __str__(self):
        """Return a one-line summary of the record.

        Raises KeyError when one of the fields used in the summary (for
        instance 'Version' or 'Queue') has not been stored in ``self.s``.
        """
        summary = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % \
            (self.s)
        summary += " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % (self.s)
        return summary
80    
class bin_pkg():
    """Record of one binary package built from a source package in NEW.

    All collected fields live in the dictionary ``self.b``, keyed by the
    field name.
    """

    def __init__(self, package, source):
        """Create the record for binary *package* built from *source*."""
        self.b = {}
        self.b['Package'] = package
        self.b['Source'] = source
        # Defaults for fields that may never be set later on
        self.b['Installed-Size'] = 0
        self.b['License'] = ''

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.b_non_mandatory:
            # setdefault() leaves already recorded values untouched
            self.b.setdefault(field, '')

    def __str__(self):
        """Return a one-line summary of the record.

        Raises KeyError when 'Version', 'Architecture', 'Maintainer',
        'Description' or 'Long_Description' has not been stored yet.
        """
        return "Package %s: %s, %s, %s, %s, %s" % \
            (self.b['Package'], self.b['Version'], self.b['Architecture'], self.b['Maintainer'],
             self.b['Description'], self.b['Long_Description'])
99    
100     class ftpnew_gatherer(gatherer):
101     "This class imports the data from New queue into the database"
102     s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
103     'Queue': 0, 'Last_modified': 0}
104     s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
105     'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
106     'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
107     'Vcs-Mtn':0, 'Vcs-Browser': 0, 'License': 0
108     }
109     s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg':0, 'X-Vcs-Git':0,
110     'Directory':0, 'Comment':0, 'Origin':0, 'Url':0, 'X-Collab-Maint':0, 'Autobuild':0, 'Vcs-Cvs:':0,
111     'Python-Standards-Version':0, 'url':0, 'originalmaintainer':0, 'Originalmaintainer':0,
112     'Build-Recommends':0,
113     'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
114     'Priority': 0, 'Section': 0, 'Python-Version': 0, 'Checksums-Sha1':0,
115     'Checksums-Sha256':0, 'Original-Maintainer':0, 'Dm-Upload-Allowed':0,
116     'Standards-Version': 0,
117     }
118    
119     b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
120     'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks':0, 'Installed-Size': 0,
121     'Homepage': 0, 'Size': 0, 'Build-Essential':0, 'Origin':0,
122     'SHA1':0, 'Replaces':0, 'Section':0, 'MD5sum':0, 'Bugs':0, 'Priority':0,
123     'Tag':0, 'Task':0, 'Python-Version':0, 'Provides':0, 'Conflicts':0,
124     'SHA256':0, 'Original-Maintainer':0}
125    
126     s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
127     s_vcs = { 'Arch':0, 'Bzr':0, 'Cvs':0, 'Darcs':0, 'Git':0, 'Hg':0, 'Svn':0, 'Mtn':0}
128    
129     src_html_failed_re = re.compile("^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
130     src_html_has_tag_re = re.compile('^\s*<tr><td class="key">([-\w]+):</td><td class="val">(.+)</td></tr>$')
131     src_html_has_description_start_re = re.compile('^\s*<tr><td class="key">Description:</td><td class="val"><pre>(.+)')
132     src_html_has_description_end_re = re.compile('(.+)</pre></td></tr>')
133     closes_is_itp_re = re.compile('^\s*(ITP|RFP|ITA)')
134     vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')
135    
136     def __init__(self, connection, config, source):
137     gatherer.__init__(self, connection, config, source)
138     self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')
139    
140    
141     def check_existing_binaries(self, values, queue):
142     # Sometimes the source package name has changed, but the binary package name is known in UDD
143     # we are not interested in these packages
144    
145     cur = self.cursor()
146     for value in values:
147     # query = "SELECT count(*) FROM packages WHERE package = '%s'" % (value)
148     query = "EXECUTE ftpnew_check_existing_package ('%s')" % (value)
149     cur.execute(query)
150     in_udd = cur.fetchone()[0]
151     if in_udd:
152 tille 1408 if DEBUG != 0:
153     print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
154     % (value, int(in_udd), queue)
155 tille 1406 return 1
156     return 0
157    
158     def run(self):
159     my_config = self.my_config
160    
161     #start harassing the DB, preparing the final inserts and making place
162     #for the new data:
163     cur = self.cursor()
164    
165     # if we check whether a package just exists in UDD we ignore oldstable which is currently etch but other
166     # dists might have to be ignored as well
167     cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)" \
168     % self.my_config["releases_ignore"])
169     # For some reason the code tries to add binary packages twice - just verify whether the package is
170     # just included to make sure we do not trigger conflicting primary keys
171     cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")
172    
173     cur.execute("DELETE FROM %s" % my_config["table_sources"])
174     cur.execute("DELETE FROM %s" % my_config["table_packages"])
175    
176     query = """PREPARE ftpnew_insert_source
177     AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
178     changed_by, architecture, homepage,
179     vcs_type, vcs_url, vcs_browser, distribution, closes, license, last_modified, queue)
180     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)""" % (my_config['table_sources'])
181     cur.execute(query)
182     query = """PREPARE ftpnew_insert_package
183     AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
184     depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
185 tille 1473 installed_size, homepage, section, long_description, distribution, license)
186     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21)""" % (my_config['table_packages'])
187 tille 1406 cur.execute(query)
188    
189     ftpnew_data = open(my_config['path']+'/new.822')
190    
191     for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
192     if stanza['queue'] == 'accepted' or stanza['queue'] == 'proposedupdates' :
193     continue
194     srcpkg = src_pkg(stanza['source'])
195     versions = stanza['version'].split(' ') # the page lists more than one version
196     srcpkg.has_several_versions = len(versions)-1 # some tests below fail if more than one version in in queue
197     srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
198     srcpkg.s['Architecture'] = stanza['architectures']
199     srcpkg.s['Queue'] = stanza['queue']
200     srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified'])) # We want a real time object instead of an epoch
201     srcpkg.s['Distribution'] = stanza['distribution']
202     srcpkg.s['Changed-By'] = stanza['changed-by']
203    
204     # Check UDD for existing source packages of this name
205     query = "SELECT count(*) FROM sources WHERE source = '%s'" % (srcpkg.s['Source'])
206     cur.execute(query)
207     in_udd = cur.fetchone()[0]
208     if in_udd:
209 tille 1408 if DEBUG != 0:
210     print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
211     % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
212 tille 1406 continue
213    
214     src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
215     src_info_html = my_config['path'] + '/' + src_info_base + '.html'
216     src_info_822 = my_config['path'] + '/' + src_info_base + '.822'
217    
218     try:
219     srci = open(src_info_html, 'r')
220     except IOError, err:
221     print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
222     continue
223     srco = open(src_info_822, 'w')
224     in_description = 0
225     in_source = 1
226     binpkgs = []
227     binpkg = None
228     for line in srci.readlines():
229     if ftpnew_gatherer.src_html_failed_re.match(line):
230     print >>stderr, "File %s not found." % (src_info_html)
231     src_info_not_found = 1
232     break
233     match = ftpnew_gatherer.src_html_has_tag_re.match(line)
234     if match:
235     field = match.groups()[0]
236     value = de_html(match.groups()[1])
237     if field == 'Package':
238     # Here begins a new binary package
239     if self.check_existing_binaries((value,), srcpkg.s['Queue']):
240     srcpkg.s['Queue'] = 'ignore'
241     break
242     if in_source:
243     in_source = 0
244     if binpkg:
245     binpkgs.append(binpkg)
246     binpkg = bin_pkg(value, srcpkg.s['Source'])
247     print >>srco, "\nPackage: %s" % (value)
248 tille 1473 binpkg.b['Distribution'] = srcpkg.s['Distribution']
249 tille 1406 elif field == 'Maintainer':
250     # print "DEBUG %s: %s" % (field, value)
251     if in_source:
252     srcpkg.s[field] = value
253     srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
254     else:
255     binpkg.b[field] = value
256     print >>srco, "%s: %s" % (field, value)
257     elif field == 'Description':
258     if in_source:
259 tille 1408 srcpkg.s[field] = de_html(value)
260 tille 1406 else:
261 tille 1408 binpkg.b[field] = de_html(value)
262 tille 1406 print >>srco, "%s: %s" % (field, value)
263     elif field == 'Architecture':
264     if in_source:
265     srcpkg.s[field] = value
266     else:
267     binpkg.b[field] = value
268     print >>srco, "%s: %s" % (field, value)
269     elif field == 'Source':
270     if in_source:
271     if value != srcpkg.s['Source']:
272     print >>stderr, "Incompatible source names between new.822(%s) and %s.html (%s)" % \
273     (srcpkg.s['Source'], src_info_base, value)
274     srcpkg.s['Source'] = value
275     print >>srco, "%s: %s" % (field, value)
276     elif field == 'Version':
277     if in_source:
278     if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
279     print >>stderr, "Incompatible version numbers between new.822(%s) and %s.html (%s)" % \
280     (srcpkg.s[field], src_info_base, value)
281     srcpkg.s[field] = value
282     else:
283     binpkg.b[field] = value
284     print >>srco, "%s: %s" % (field, value)
285     elif field == 'Closes':
286     values = value.split(' ')
287     found_itp = 0
288     for val in values:
289     ival = int(val)
290     query = "SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival)
291     cur.execute(query)
292     try:
293     wnpp_title = cur.fetchone()[0]
294     except TypeError, err:
295     query = "SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival)
296     cur.execute(query)
297     bug_info = cur.fetchone()
298     if not bug_info:
299     print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
300     else:
301     print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
302     if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
303     print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
304     else:
305     if found_itp:
306     print >>stderr, "Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD" % \
307     (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes'])
308     query = "SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival)
309     cur.execute(query)
310     is_merged = cur.fetchone()[0]
311     if is_merged != 2:
312     print >>stderr, " --> Bugs should be merged in BTS!"
313     else: # stay with the ITP found first
314     srcpkg.s['Closes'] = int(ival)
315     found_itp = 1
316     if not found_itp:
317     print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])
318     print >>srco, "%s: %s\n" % (field, value)
319     elif field == 'Distribution':
320     if in_source:
321     if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
322     print >>stderr, "Incompatible distributions between new.822(%s) and %s.html (%s)" % \
323     (srcpkg.s['Distribution'], src_info_base, value)
324     srcpkg.s['Distribution'] = value
325     print >>srco, "%s: %s" % (field, value)
326     else:
327     print >>stderr, "Binary should not mention distribution field in %s.html (%s)" % \
328     (src_info_base, value)
329     elif field == 'Binary':
330     if in_source:
331     # Binaries are mentioned in different syntax in *.changes and *.dsc
332     value = re.sub(", +", " ", value)
333     if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
334     srcpkg.s['Queue'] = 'ignore'
335     break
336     if in_source:
337     if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
338     print >>stderr, "Incompatible binaries between new.822(%s) and %s.html (%s)" % \
339     (srcpkg.s['Bin'], src_info_base, value)
340     srcpkg.s['Bin'] = value
341     print >>srco, "%s: %s" % (field, value)
342     else:
343     print >>stderr, "Binary should not mention Binary field in %s.html (%s)" % \
344     (src_info_base, value)
345     elif field == 'Installed-Size':
346     if not in_source:
347     binpkg.b[field] = int(value)
348     elif field == 'Homepage':
349     if not in_source:
350     binpkg.b[field] = value
351     elif field == 'Section':
352     if not in_source:
353     if not binpkg:
354     print >>stderr, "This should not happen", srcpkg, field, value
355     exit(-1)
356     elif field == 'Vcs-Browser':
357     srcpkg.s[field] = value
358     elif binpkg != None and field in dependencies_to_accept:
359     binpkg.b[field] = value
360     print >>srco, "%s: %s" % (field, value)
361     elif field in fields_to_pass or field.startswith('Npp-'):
362     print >>srco, "%s: %s" % (field, value)
363     else:
364     matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
365     if matchvcs:
366     srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
367     srcpkg.s['Vcs-Url'] = value
368     print >>srco, "%s: %s" % (field, value)
369     else:
370     print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
371     print >>srco, "*%s: %s" % (field, value)
372     continue
373     if in_description:
374     match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
375     if match:
376     if match.groups()[0][0] != ' ':
377     description += ' '
378 tille 1408 description += de_html(match.groups()[0])
379 tille 1406 in_description = 0
380     if not in_source: # binpkg and binpkg.b:
381 tille 1408 (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
382     print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
383 tille 1406 else:
384     if line[0] != ' ':
385     description += ' '
386 tille 1408 description += de_html(line)
387 tille 1406 else:
388     match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
389     if match:
390     in_description = 1
391 tille 1408 description = de_html(match.groups()[0]) + "\n"
392 tille 1406 srci.close()
393     srco.close()
394     # cur.execute("EXECUTE ftpnew_insert (%s, %s, %s, %s)"\
395     # % (quote(pkg), pkg_type, quote(tag), quote(ftpnew_gatherer.code_to_tag_type_map[code])));
396     if srcpkg.s['Queue'] != 'ignore':
397     # print srcpkg
398     srcpkg.check_dict()
399     query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
400     %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
401     %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
402     %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
403     %(Distribution)s, %(Closes)s, %(License)s,
404     %(Last_modified)s, %(Queue)s)"""
405     cur.execute(query, srcpkg.s)
406     for binpkg in binpkgs:
407     # print binpkg
408     binpkg.check_dict()
409     query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
410     %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
411     %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
412     %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
413     %(Installed-Size)s, %(Homepage)s, %(Section)s,
414 tille 1473 %(Long_Description)s, %(Distribution)s, %(License)s)"""
415 tille 1406 try:
416     cur.execute(query, binpkg.b)
417     except IntegrityError, err:
418     print >>stderr, err, src_info_html
419     print >>stderr, binpkg
420     print >>stderr, binpkg.b
421     continue
422    
423     cur.execute("DEALLOCATE ftpnew_insert_source")
424     cur.execute("DEALLOCATE ftpnew_insert_package")
425     cur.execute("DEALLOCATE ftpnew_check_existing_package")
426    
if __name__ == '__main__':
    # No main() is defined or imported in this file, so the former call
    # 'main()' could only end in a NameError.  Fail with a clear message
    # instead (SystemExit with a string prints it to stderr, exit status 1).
    raise SystemExit("This module is not a standalone script: no main() is defined. "
                     "Obtain a gatherer via get_gatherer(connection, config, source) and call its run() method.")
429    
430     # vim:set et tabstop=2:

  ViewVC Help
Powered by ViewVC 1.1.5