/[collab-qa]/udd/udd/ftpnew_gatherer.py
ViewVC logotype

Contents of /udd/udd/ftpnew_gatherer.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1408 - (hide annotations) (download) (as text)
Mon Mar 16 10:21:12 2009 UTC (4 years, 3 months ago) by tille
File MIME type: text/x-python
File size: 20327 byte(s)
Forgot to add ddtp_gatherer in latest commit; fixed some issues in description parsing for ftpnew
1 tille 1406 #!/usr/bin/env python
2    
3     """
4     This script imports information from ftp new queue into the database
5     See http://ftp-master.debian.org/new.822 and
6     http://ftp-master.debian.org/new.html
7     """
8    
9     from debian_bundle import deb822
10     from os import access, mkdir, unlink, W_OK
11     from sys import stderr
12     import aux
13     from aux import quote
14     from gatherer import gatherer
15     import email.Utils
16     import re
17     from time import ctime
18     from psycopg2 import IntegrityError
19    
def get_gatherer(connection, config, source):
  """Build the gatherer object of this module.

  All three arguments are handed unchanged to the ftpnew_gatherer
  constructor and the freshly created instance is returned.
  """
  new_queue_gatherer = ftpnew_gatherer(connection, config, source)
  return new_queue_gatherer
22    
# Any non-zero value makes the gatherer report on stderr which packages are
# skipped because they are already known in UDD.
DEBUG = 0
24    
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
  """Return `string` with markup removed and basic HTML entities decoded.

  Opening and closing <span ...> and <pre> tags are dropped, then the
  entities &quot;, &lt;, &gt; and &amp; are replaced by the characters
  they stand for.
  """
  # Strip the real tags first: once entities are decoded an escaped
  # "&lt;pre&gt;" is plain text and must not be mistaken for markup.
  string = re.sub(r"</?span[^>]*>", '', string)
  string = re.sub(r"</?pre>", '', string)
  string = string.replace("&quot;", '"')
  string = string.replace("&lt;", '<')
  string = string.replace("&gt;", '>')
  # "&amp;" has to be decoded last, otherwise "&amp;lt;" (an escaped
  # "&lt;") would be decoded twice and end up as "<".
  string = string.replace("&amp;", '&')
  return string
34    
# These fields are not forwarded to UDD tables for the moment; they are only
# copied into the generated *.822 file.  The same applies to every field
# whose name starts with 'Npp-'.
fields_to_pass = (
  'Format',
  'Date',
  'Changed-By',
  'Files',
  'Uploaders',
  'Standards-Version',
  'Priority',
  'Urgency',
  'Dm-Upload-Allowed',
  'Autobuild',
  'Build-Depends',
  'Build-Depends-Indep',
  'Build-Conflicts',
  'Python-Version',
)

# Relationship fields which are stored for binary packages
dependencies_to_accept = (
  'Depends',
  'Recommends',
  'Suggests',
  'Enhances',
  'Pre-Depends',
  'Breaks',
  'Replaces',
  'Provides',
  'Conflicts',
)
54    
class src_pkg():
  """Data collected about one source package of the new queue.

  The field values are kept in the dictionary `self.s`, keyed by field
  name; `has_several_versions` is a plain attribute preset to 0.
  """

  def __init__(self, source):
    self.has_several_versions = 0
    self.s = {
      'Source': source,
      'Bin': (),            # comma separated list of binaries created from the source
      'Architecture': (),   # architecture(s separated by blanks)
      # Just define Vcs fields in case it is not provided in the control
      'Vcs-Type': None,
      'Vcs-Url': None,
      # preset WNPP bug
      'Closes': 0,
    }

  def check_dict(self):
    "Make sure that non-mandatory fields at least get a '' value"
    for name in ftpnew_gatherer.s_non_mandatory:
      self.s.setdefault(name, '')

  def __str__(self):
    # Both templates are filled by key from self.s; a missing key raises KeyError.
    head = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % self.s
    tail = " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % self.s
    return head + tail
80    
class bin_pkg():
  """Data collected about one binary package of the new queue.

  The field values are kept in the dictionary `self.b`, keyed by field name.
  """

  def __init__(self, package, source):
    self.b = {
      'Package': package,
      'Source': source,
      'Installed-Size': 0,
      'License': '',
    }

  def check_dict(self):
    "Make sure that non-mandatory fields at least get a '' value"
    for name in ftpnew_gatherer.b_non_mandatory:
      self.b.setdefault(name, '')

  def __str__(self):
    # All six keys have to be present, otherwise a KeyError is raised.
    shown = ('Package', 'Version', 'Architecture', 'Maintainer',
             'Description', 'Long_Description')
    return "Package %s: %s, %s, %s, %s, %s" % tuple([self.b[key] for key in shown])
99    
100     class ftpnew_gatherer(gatherer):
101     "This class imports the data from New queue into the database"
102     s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
103     'Queue': 0, 'Last_modified': 0}
104     s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
105     'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
106     'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
107     'Vcs-Mtn':0, 'Vcs-Browser': 0, 'License': 0
108     }
109     s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg':0, 'X-Vcs-Git':0,
110     'Directory':0, 'Comment':0, 'Origin':0, 'Url':0, 'X-Collab-Maint':0, 'Autobuild':0, 'Vcs-Cvs:':0,
111     'Python-Standards-Version':0, 'url':0, 'originalmaintainer':0, 'Originalmaintainer':0,
112     'Build-Recommends':0,
113     'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
114     'Priority': 0, 'Section': 0, 'Python-Version': 0, 'Checksums-Sha1':0,
115     'Checksums-Sha256':0, 'Original-Maintainer':0, 'Dm-Upload-Allowed':0,
116     'Standards-Version': 0,
117     }
118    
119     b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
120     'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks':0, 'Installed-Size': 0,
121     'Homepage': 0, 'Size': 0, 'Build-Essential':0, 'Origin':0,
122     'SHA1':0, 'Replaces':0, 'Section':0, 'MD5sum':0, 'Bugs':0, 'Priority':0,
123     'Tag':0, 'Task':0, 'Python-Version':0, 'Provides':0, 'Conflicts':0,
124     'SHA256':0, 'Original-Maintainer':0}
125    
126     s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
127     s_vcs = { 'Arch':0, 'Bzr':0, 'Cvs':0, 'Darcs':0, 'Git':0, 'Hg':0, 'Svn':0, 'Mtn':0}
128    
129     src_html_failed_re = re.compile("^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
130     src_html_has_tag_re = re.compile('^\s*<tr><td class="key">([-\w]+):</td><td class="val">(.+)</td></tr>$')
131     src_html_has_description_start_re = re.compile('^\s*<tr><td class="key">Description:</td><td class="val"><pre>(.+)')
132     src_html_has_description_end_re = re.compile('(.+)</pre></td></tr>')
133     closes_is_itp_re = re.compile('^\s*(ITP|RFP|ITA)')
134     vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')
135    
136     def __init__(self, connection, config, source):
137     gatherer.__init__(self, connection, config, source)
138     self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')
139    
140    
141     def check_existing_binaries(self, values, queue):
142     # Sometimes the source package name has changed, but the binary package name is known in UDD
143     # we are not interested in these packages
144    
145     cur = self.cursor()
146     for value in values:
147     # query = "SELECT count(*) FROM packages WHERE package = '%s'" % (value)
148     query = "EXECUTE ftpnew_check_existing_package ('%s')" % (value)
149     cur.execute(query)
150     in_udd = cur.fetchone()[0]
151     if in_udd:
152 tille 1408 if DEBUG != 0:
153     print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
154     % (value, int(in_udd), queue)
155 tille 1406 return 1
156     return 0
157    
158     def run(self):
159     my_config = self.my_config
160    
161     #start harassing the DB, preparing the final inserts and making place
162     #for the new data:
163     cur = self.cursor()
164    
165     # if we check whether a package just exists in UDD we ignore oldstable which is currently etch but other
166     # dists might have to be ignored as well
167     cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)" \
168     % self.my_config["releases_ignore"])
169     # For some reason the code tries to add binary packages twice - just verify whether the package is
170     # just included to make sure we do not trigger conflicting primary keys
171     cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")
172    
173     cur.execute("DELETE FROM %s" % my_config["table_sources"])
174     cur.execute("DELETE FROM %s" % my_config["table_packages"])
175    
176     query = """PREPARE ftpnew_insert_source
177     AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
178     changed_by, architecture, homepage,
179     vcs_type, vcs_url, vcs_browser, distribution, closes, license, last_modified, queue)
180     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)""" % (my_config['table_sources'])
181     cur.execute(query)
182     query = """PREPARE ftpnew_insert_package
183     AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
184     depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
185     installed_size, homepage, section, long_description, license)
186     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20)""" % (my_config['table_packages'])
187     cur.execute(query)
188    
189     ftpnew_data = open(my_config['path']+'/new.822')
190    
191     for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
192     if stanza['queue'] == 'accepted' or stanza['queue'] == 'proposedupdates' :
193     continue
194     srcpkg = src_pkg(stanza['source'])
195     versions = stanza['version'].split(' ') # the page lists more than one version
196     srcpkg.has_several_versions = len(versions)-1 # some tests below fail if more than one version in in queue
197     srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
198     srcpkg.s['Architecture'] = stanza['architectures']
199     srcpkg.s['Queue'] = stanza['queue']
200     srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified'])) # We want a real time object instead of an epoch
201     srcpkg.s['Distribution'] = stanza['distribution']
202     srcpkg.s['Changed-By'] = stanza['changed-by']
203    
204     # Check UDD for existing source packages of this name
205     query = "SELECT count(*) FROM sources WHERE source = '%s'" % (srcpkg.s['Source'])
206     cur.execute(query)
207     in_udd = cur.fetchone()[0]
208     if in_udd:
209 tille 1408 if DEBUG != 0:
210     print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
211     % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
212 tille 1406 continue
213    
214     src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
215     src_info_html = my_config['path'] + '/' + src_info_base + '.html'
216     src_info_822 = my_config['path'] + '/' + src_info_base + '.822'
217    
218     try:
219     srci = open(src_info_html, 'r')
220     except IOError, err:
221     print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
222     continue
223     srco = open(src_info_822, 'w')
224     in_description = 0
225     in_source = 1
226     binpkgs = []
227     binpkg = None
228     for line in srci.readlines():
229     if ftpnew_gatherer.src_html_failed_re.match(line):
230     print >>stderr, "File %s not found." % (src_info_html)
231     src_info_not_found = 1
232     break
233     match = ftpnew_gatherer.src_html_has_tag_re.match(line)
234     if match:
235     field = match.groups()[0]
236     value = de_html(match.groups()[1])
237     if field == 'Package':
238     # Here begins a new binary package
239     if self.check_existing_binaries((value,), srcpkg.s['Queue']):
240     srcpkg.s['Queue'] = 'ignore'
241     break
242     if in_source:
243     in_source = 0
244     if binpkg:
245     binpkgs.append(binpkg)
246     binpkg = bin_pkg(value, srcpkg.s['Source'])
247     print >>srco, "\nPackage: %s" % (value)
248     elif field == 'Maintainer':
249     # print "DEBUG %s: %s" % (field, value)
250     if in_source:
251     srcpkg.s[field] = value
252     srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
253     else:
254     binpkg.b[field] = value
255     print >>srco, "%s: %s" % (field, value)
256     elif field == 'Description':
257     if in_source:
258 tille 1408 srcpkg.s[field] = de_html(value)
259 tille 1406 else:
260 tille 1408 binpkg.b[field] = de_html(value)
261 tille 1406 print >>srco, "%s: %s" % (field, value)
262     elif field == 'Architecture':
263     if in_source:
264     srcpkg.s[field] = value
265     else:
266     binpkg.b[field] = value
267     print >>srco, "%s: %s" % (field, value)
268     elif field == 'Source':
269     if in_source:
270     if value != srcpkg.s['Source']:
271     print >>stderr, "Incompatible source names between new.822(%s) and %s.html (%s)" % \
272     (srcpkg.s['Source'], src_info_base, value)
273     srcpkg.s['Source'] = value
274     print >>srco, "%s: %s" % (field, value)
275     elif field == 'Version':
276     if in_source:
277     if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
278     print >>stderr, "Incompatible version numbers between new.822(%s) and %s.html (%s)" % \
279     (srcpkg.s[field], src_info_base, value)
280     srcpkg.s[field] = value
281     else:
282     binpkg.b[field] = value
283     print >>srco, "%s: %s" % (field, value)
284     elif field == 'Closes':
285     values = value.split(' ')
286     found_itp = 0
287     for val in values:
288     ival = int(val)
289     query = "SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival)
290     cur.execute(query)
291     try:
292     wnpp_title = cur.fetchone()[0]
293     except TypeError, err:
294     query = "SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival)
295     cur.execute(query)
296     bug_info = cur.fetchone()
297     if not bug_info:
298     print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
299     else:
300     print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
301     if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
302     print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
303     else:
304     if found_itp:
305     print >>stderr, "Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD" % \
306     (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes'])
307     query = "SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival)
308     cur.execute(query)
309     is_merged = cur.fetchone()[0]
310     if is_merged != 2:
311     print >>stderr, " --> Bugs should be merged in BTS!"
312     else: # stay with the ITP found first
313     srcpkg.s['Closes'] = int(ival)
314     found_itp = 1
315     if not found_itp:
316     print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])
317     print >>srco, "%s: %s\n" % (field, value)
318     elif field == 'Distribution':
319     if in_source:
320     if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
321     print >>stderr, "Incompatible distributions between new.822(%s) and %s.html (%s)" % \
322     (srcpkg.s['Distribution'], src_info_base, value)
323     srcpkg.s['Distribution'] = value
324     print >>srco, "%s: %s" % (field, value)
325     else:
326     print >>stderr, "Binary should not mention distribution field in %s.html (%s)" % \
327     (src_info_base, value)
328     elif field == 'Binary':
329     if in_source:
330     # Binaries are mentioned in different syntax in *.changes and *.dsc
331     value = re.sub(", +", " ", value)
332     if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
333     srcpkg.s['Queue'] = 'ignore'
334     break
335     if in_source:
336     if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
337     print >>stderr, "Incompatible binaries between new.822(%s) and %s.html (%s)" % \
338     (srcpkg.s['Bin'], src_info_base, value)
339     srcpkg.s['Bin'] = value
340     print >>srco, "%s: %s" % (field, value)
341     else:
342     print >>stderr, "Binary should not mention Binary field in %s.html (%s)" % \
343     (src_info_base, value)
344     elif field == 'Installed-Size':
345     if not in_source:
346     binpkg.b[field] = int(value)
347     elif field == 'Homepage':
348     if not in_source:
349     binpkg.b[field] = value
350     elif field == 'Section':
351     if not in_source:
352     if not binpkg:
353     print >>stderr, "This should not happen", srcpkg, field, value
354     exit(-1)
355     elif field == 'Vcs-Browser':
356     srcpkg.s[field] = value
357     elif binpkg != None and field in dependencies_to_accept:
358     binpkg.b[field] = value
359     print >>srco, "%s: %s" % (field, value)
360     elif field in fields_to_pass or field.startswith('Npp-'):
361     print >>srco, "%s: %s" % (field, value)
362     else:
363     matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
364     if matchvcs:
365     srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
366     srcpkg.s['Vcs-Url'] = value
367     print >>srco, "%s: %s" % (field, value)
368     else:
369     print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
370     print >>srco, "*%s: %s" % (field, value)
371     continue
372     if in_description:
373     match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
374     if match:
375     if match.groups()[0][0] != ' ':
376     description += ' '
377 tille 1408 description += de_html(match.groups()[0])
378 tille 1406 in_description = 0
379     if not in_source: # binpkg and binpkg.b:
380 tille 1408 (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
381     print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
382 tille 1406 else:
383     if line[0] != ' ':
384     description += ' '
385 tille 1408 description += de_html(line)
386 tille 1406 else:
387     match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
388     if match:
389     in_description = 1
390 tille 1408 description = de_html(match.groups()[0]) + "\n"
391 tille 1406 srci.close()
392     srco.close()
393     # cur.execute("EXECUTE ftpnew_insert (%s, %s, %s, %s)"\
394     # % (quote(pkg), pkg_type, quote(tag), quote(ftpnew_gatherer.code_to_tag_type_map[code])));
395     if srcpkg.s['Queue'] != 'ignore':
396     # print srcpkg
397     srcpkg.check_dict()
398     query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
399     %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
400     %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
401     %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
402     %(Distribution)s, %(Closes)s, %(License)s,
403     %(Last_modified)s, %(Queue)s)"""
404     cur.execute(query, srcpkg.s)
405     for binpkg in binpkgs:
406     # print binpkg
407     binpkg.check_dict()
408     query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
409     %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
410     %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
411     %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
412     %(Installed-Size)s, %(Homepage)s, %(Section)s,
413     %(Long_Description)s, %(License)s)"""
414     try:
415     cur.execute(query, binpkg.b)
416     except IntegrityError, err:
417     print >>stderr, err, src_info_html
418     print >>stderr, binpkg
419     print >>stderr, binpkg.b
420     continue
421    
422     cur.execute("DEALLOCATE ftpnew_insert_source")
423     cur.execute("DEALLOCATE ftpnew_insert_package")
424     cur.execute("DEALLOCATE ftpnew_check_existing_package")
425    
# Entry point when the module is executed as a script.
# NOTE(review): no `main` is defined or imported in the visible part of this
# module, so this call would raise a NameError - confirm where `main` is
# supposed to come from.
if __name__ == '__main__':
  main()
428    
429     # vim:set et tabstop=2:

  ViewVC Help
Powered by ViewVC 1.1.5