/[collab-qa]/udd/udd/ftpnew_gatherer.py
ViewVC logotype

Contents of /udd/udd/ftpnew_gatherer.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1531 - (hide annotations) (download) (as text)
Thu Jul 23 14:05:39 2009 UTC (3 years, 9 months ago) by lucas
File MIME type: text/x-python
File size: 21556 byte(s)
add ANALYZE at the end of all importers to teach pgsql some stats about the data we just imported
1 tille 1406 #!/usr/bin/env python
2    
3     """
4     This script imports information from ftp new queue into the database
5     See http://ftp-master.debian.org/new.822 and
6     http://ftp-master.debian.org/new.html
7     """
8    
9     from debian_bundle import deb822
10     from os import access, mkdir, unlink, W_OK
11     from sys import stderr
12     import aux
13     from aux import quote
14     from gatherer import gatherer
15     import email.Utils
16     import re
17     from time import ctime
18     from psycopg2 import IntegrityError
19    
def get_gatherer(connection, config, source):
    """Return a new ftpnew_gatherer built from the given connection, config and source."""
    importer = ftpnew_gatherer(connection, config, source)
    return importer
22    
# Any non-zero value makes the importer report on stderr every package that is
# skipped because it is already known in UDD.
DEBUG = 0
24    
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
    """Return *string* with <span>/<pre> markup removed and HTML entities decoded.

    Only the entities &quot;, &lt;, &gt; and &amp; are decoded.

    Markup is stripped before entities are decoded so that escaped text such
    as "&lt;pre&gt;" survives as the literal "<pre>".  "&amp;" is decoded
    last so that an escaped entity such as "&amp;lt;" becomes "&lt;" and is
    not unescaped a second time.
    """
    # 1. Remove real markup.
    string = re.sub("</?span[^>]*>", '', string)
    string = re.sub("</?pre>", '', string)
    # 2. Decode entities; the ampersand must come last (see docstring).
    string = string.replace("&quot;", '"')
    string = string.replace("&lt;", '<')
    string = string.replace("&gt;", '>')
    string = string.replace("&amp;", '&')
    return string
34    
# These fields are not forwarded to UDD tables for the moment
fields_to_pass = (
    'Format',
    'Date',
    'Changed-By',
    'Files',
    'Uploaders',
    'Standards-Version',
    'Priority',
    'Urgency',
    'Dm-Upload-Allowed',
    'Autobuild',
    'Build-Depends',
    'Build-Depends-Indep',
    'Build-Conflicts',
    'Python-Version',
)
# + startswith('Npp-')

# Relationship fields that are copied into a binary package record
dependencies_to_accept = (
    'Depends',
    'Recommends',
    'Suggests',
    'Enhances',
    'Pre-Depends',
    'Breaks',
    'Replaces',
    'Provides',
    'Conflicts',
)
54    
class src_pkg():
    """Source package taken from the NEW queue.

    All field values live in the dictionary ``self.s``, keyed by field name.
    ``has_several_versions`` starts at 0 and is overwritten by the importer.
    """

    def __init__(self, source):
        """Create the record for source package *source* with preset defaults."""
        self.s = {}
        self.s['Source'] = source
        self.has_several_versions = 0
        self.s['Bin'] = ()           # comma separated list of binaries created from the source
        self.s['Architecture'] = ()  # architecture(s separated by blanks)
        # Just define Vcs fields in case it is not provided in the control
        self.s['Vcs-Type'] = None
        self.s['Vcs-Url'] = None
        # preset WNPP bug
        self.s['Closes'] = 0

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.s_non_mandatory:
            # setdefault leaves values that are already present untouched
            self.s.setdefault(field, '')

    def __str__(self):
        """Return a one-line summary; raises KeyError if a summarised field is unset."""
        summary = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % \
            (self.s)
        summary += " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % (self.s)
        return summary
80    
class bin_pkg():
    """Binary package built from a source package in the NEW queue.

    All field values live in the dictionary ``self.b``, keyed by field name.
    """

    def __init__(self, package, source):
        """Create the record for binary *package* that belongs to *source*."""
        self.b = {}
        self.b['Package'] = package
        self.b['Source'] = source
        self.b['Installed-Size'] = 0
        self.b['License'] = ''

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.b_non_mandatory:
            # setdefault leaves values that are already present untouched
            self.b.setdefault(field, '')

    def __str__(self):
        """Return a one-line summary; raises KeyError if a summarised field is unset."""
        return "Package %s: %s, %s, %s, %s, %s" % \
            (self.b['Package'], self.b['Version'], self.b['Architecture'], self.b['Maintainer'],
             self.b['Description'], self.b['Long_Description'])
99    
class ftpnew_gatherer(gatherer):
    """This class imports the data from New queue into the database.

    run() reads <path>/new.822, and for every queued source package that is
    not yet known in UDD parses <path>/<source>_<version>.html, writes the
    extracted fields to <path>/<source>_<version>.822 and inserts the source
    and its binary packages into the configured tables.
    """

    # Field name tables.  Only s_non_mandatory and b_non_mandatory are read by
    # the code in this file (src_pkg.check_dict / bin_pkg.check_dict iterate
    # over their keys); the 0 values are never read here.
    s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
                   'Queue': 0, 'Last_modified': 0}
    s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
                       'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
                       'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
                       'Vcs-Mtn': 0, 'Vcs-Browser': 0, 'License': 0, 'Section': 0
                       }
    s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg': 0, 'X-Vcs-Git': 0,
                   'Directory': 0, 'Comment': 0, 'Origin': 0, 'Url': 0, 'X-Collab-Maint': 0, 'Autobuild': 0, 'Vcs-Cvs:': 0,
                   'Python-Standards-Version': 0, 'url': 0, 'originalmaintainer': 0, 'Originalmaintainer': 0,
                   'Build-Recommends': 0,
                   'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
                   'Priority': 0, 'Python-Version': 0, 'Checksums-Sha1': 0,
                   'Checksums-Sha256': 0, 'Original-Maintainer': 0, 'Dm-Upload-Allowed': 0,
                   'Standards-Version': 0,
                   }

    b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
                       'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks': 0, 'Installed-Size': 0,
                       'Homepage': 0, 'Size': 0, 'Build-Essential': 0, 'Origin': 0,
                       'SHA1': 0, 'Replaces': 0, 'Section': 0, 'MD5sum': 0, 'Bugs': 0, 'Priority': 0,
                       'Tag': 0, 'Task': 0, 'Python-Version': 0, 'Provides': 0, 'Conflicts': 0,
                       'SHA256': 0, 'Original-Maintainer': 0}

    s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
    s_vcs = {'Arch': 0, 'Bzr': 0, 'Cvs': 0, 'Darcs': 0, 'Git': 0, 'Hg': 0, 'Svn': 0, 'Mtn': 0}

    # Patterns applied line by line to the per-package html page.
    src_html_failed_re = re.compile(r"^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
    src_html_has_tag_re = re.compile(r'^\s*<tr><td class="key">([-\w]+):</td><td class="val">(.+)</td></tr>$')
    src_html_has_description_start_re = re.compile(r'^\s*<tr><td class="key">Description:</td><td class="val"><pre>(.+)')
    src_html_has_description_end_re = re.compile('(.+)</pre></td></tr>')
    closes_is_itp_re = re.compile(r'^\s*(ITP|RFP|ITA)')
    vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')

    def __init__(self, connection, config, source):
        """Initialise the base gatherer and check that all needed config keys are set."""
        gatherer.__init__(self, connection, config, source)
        self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')

    def check_existing_binaries(self, values, queue):
        """Return 1 if any binary package name in *values* is already in UDD, else 0.

        Sometimes the source package name has changed, but the binary package
        name is known in UDD; we are not interested in these packages.
        *queue* is only used for the debug message.  Relies on the prepared
        statement ftpnew_check_existing_package, which run() sets up.
        """
        cur = self.cursor()
        for value in values:
            # The name is scraped from an html page: let the driver quote it
            # instead of pasting it into the SQL text.
            cur.execute("EXECUTE ftpnew_check_existing_package (%s)", (value,))
            in_udd = cur.fetchone()[0]
            if in_udd:
                if DEBUG != 0:
                    print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
                        % (value, int(in_udd), queue)
                return 1
        return 0

    def run(self):
        """Replace the contents of the ftpnew tables with the current NEW queue."""
        my_config = self.my_config

        #start harassing the DB, preparing the final inserts and making place
        #for the new data:
        cur = self.cursor()
        self._prepare_statements(cur)

        ftpnew_data = open(my_config['path']+'/new.822')
        try:
            self._import_new_queue(cur, ftpnew_data)
        finally:
            ftpnew_data.close()

        cur.execute("DEALLOCATE ftpnew_insert_source")
        cur.execute("DEALLOCATE ftpnew_insert_package")
        cur.execute("DEALLOCATE ftpnew_check_existing_package")
        # This statement is prepared as well, so release it too; otherwise a
        # second PREPARE in the same session would fail.
        cur.execute("DEALLOCATE ftpnew_check_just_added_package")
        cur.execute("ANALYZE %s" % my_config["table_sources"])
        cur.execute("ANALYZE %s" % my_config["table_packages"])

    def _prepare_statements(self, cur):
        """Prepare the SQL statements used by the import and empty both target tables."""
        my_config = self.my_config

        # if we check whether a package just exists in UDD we ignore oldstable which is currently etch but other
        # dists might have to be ignored as well
        cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)" \
                    % my_config["releases_ignore"])
        # For some reason the code tries to add binary packages twice - just verify whether the package is
        # just included to make sure we do not trigger conflicting primary keys
        # NOTE(review): this statement is prepared but never executed in this file.
        cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")

        cur.execute("DELETE FROM %s" % my_config["table_sources"])
        cur.execute("DELETE FROM %s" % my_config["table_packages"])

        query = """PREPARE ftpnew_insert_source
                   AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
                                      changed_by, architecture, homepage,
                                      vcs_type, vcs_url, vcs_browser,
                                      section, distribution, component, closes, license, last_modified, queue)
                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)""" % (my_config['table_sources'])
        cur.execute(query)
        query = """PREPARE ftpnew_insert_package
                   AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
                                      depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
                                      installed_size, homepage, section, long_description, distribution, component, license)
                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)""" \
                % (my_config['table_packages'])
        cur.execute(query)

    def _import_new_queue(self, cur, ftpnew_data):
        """Import every stanza of the open new.822 file that describes a really new package."""
        my_config = self.my_config
        has_warned_about_missing_section_key = 0
        for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
            if stanza['queue'] == 'accepted' or stanza['queue'] == 'proposedupdates':
                continue
            srcpkg = src_pkg(stanza['source'])
            versions = stanza['version'].split(' ')  # the page lists more than one version
            srcpkg.has_several_versions = len(versions)-1  # some tests below fail if more than one version in in queue
            srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
            srcpkg.s['Architecture'] = stanza['architectures']
            srcpkg.s['Queue'] = stanza['queue']
            srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified']))  # We want a real time object instead of an epoch
            srcpkg.s['Distribution'] = stanza['distribution']
            srcpkg.s['Changed-By'] = stanza['changed-by']
            try:
                section = stanza['section']
            except KeyError:
                srcpkg.s['Section'] = ''
                srcpkg.s['Component'] = ''
                if has_warned_about_missing_section_key == 0:
                    has_warned_about_missing_section_key = 1
                    print >>stderr, "Warning: Because of a bug in DAK code the Section field is currently missing."
            else:
                srcpkg.s['Section'] = section
                if section.startswith('non-free'):
                    srcpkg.s['Component'] = 'non-free'
                elif section.startswith('contrib'):
                    srcpkg.s['Component'] = 'contrib'
                else:
                    srcpkg.s['Component'] = 'main'

            # Check UDD for existing source packages of this name
            cur.execute("SELECT count(*) FROM sources WHERE source = %s", (srcpkg.s['Source'],))
            in_udd = cur.fetchone()[0]
            if in_udd:
                if DEBUG != 0:
                    print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
                        % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Queue'])
                continue

            src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
            src_info_html = my_config['path'] + '/' + src_info_base + '.html'
            src_info_822 = my_config['path'] + '/' + src_info_base + '.822'

            try:
                srci = open(src_info_html, 'r')
            except IOError as err:
                print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
                continue
            try:
                srco = open(src_info_822, 'w')
                try:
                    binpkgs, src_info_found = self._parse_src_html(cur, srcpkg, srci, srco,
                                                                   src_info_base, src_info_html)
                finally:
                    srco.close()
            finally:
                srci.close()

            if not src_info_found:
                # The html file is just a "not found" page: nothing to insert.
                continue
            if srcpkg.s['Queue'] != 'ignore':
                self._insert_packages(cur, srcpkg, binpkgs, src_info_html)

    def _parse_src_html(self, cur, srcpkg, srci, srco, src_info_base, src_info_html):
        """Parse one per-package html page, fill *srcpkg* and return its binary packages.

        Lines are read from the open file *srci*; accepted fields are echoed
        to the open file *srco*.  Returns ``(binpkgs, found)`` where *binpkgs*
        is the list of bin_pkg objects read and *found* is 0 if the page is
        the web server's "not found" message, else 1.  If a binary package of
        the page is already known in UDD, parsing stops early and
        ``srcpkg.s['Queue']`` is set to 'ignore'.
        """
        in_description = 0
        in_source = 1     # 1 until the first 'Package' field starts a binary stanza
        src_info_found = 1
        binpkgs = []
        binpkg = None
        for line in srci:
            if ftpnew_gatherer.src_html_failed_re.match(line):
                print >>stderr, "File %s not found." % (src_info_html)
                src_info_found = 0
                break
            match = ftpnew_gatherer.src_html_has_tag_re.match(line)
            if match:
                field = match.groups()[0]
                value = de_html(match.groups()[1])
                if field == 'Package':
                    # Here begins a new binary package
                    if self.check_existing_binaries((value,), srcpkg.s['Queue']):
                        srcpkg.s['Queue'] = 'ignore'
                        break
                    in_source = 0
                    if binpkg:
                        binpkgs.append(binpkg)
                    binpkg = bin_pkg(value, srcpkg.s['Source'])
                    print >>srco, "\nPackage: %s" % (value)
                    binpkg.b['Distribution'] = srcpkg.s['Distribution']
                elif field == 'Maintainer':
                    if in_source:
                        srcpkg.s[field] = value
                        srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
                    else:
                        binpkg.b[field] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field == 'Description':
                    # value has already been passed through de_html above;
                    # decoding it a second time would corrupt escaped text.
                    if in_source:
                        srcpkg.s[field] = value
                    else:
                        binpkg.b[field] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field == 'Architecture':
                    if in_source:
                        srcpkg.s[field] = value
                    else:
                        binpkg.b[field] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field == 'Source':
                    if in_source:
                        if value != srcpkg.s['Source']:
                            print >>stderr, "Incompatible source names between new.822(%s) and %s.html (%s)" % \
                                (srcpkg.s['Source'], src_info_base, value)
                        srcpkg.s['Source'] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field == 'Version':
                    if in_source:
                        if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
                            print >>stderr, "Incompatible version numbers between new.822(%s) and %s.html (%s)" % \
                                (srcpkg.s[field], src_info_base, value)
                        srcpkg.s[field] = value
                    else:
                        binpkg.b[field] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field == 'Closes':
                    self._record_itp_bug(cur, srcpkg, value)
                    print >>srco, "%s: %s\n" % (field, value)
                elif field == 'Distribution':
                    if in_source:
                        if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
                            print >>stderr, "Incompatible distributions between new.822(%s) and %s.html (%s)" % \
                                (srcpkg.s['Distribution'], src_info_base, value)
                        srcpkg.s['Distribution'] = value
                        print >>srco, "%s: %s" % (field, value)
                    else:
                        print >>stderr, "Binary should not mention distribution field in %s.html (%s)" % \
                            (src_info_base, value)
                elif field == 'Binary':
                    if in_source:
                        # Binaries are mentioned in different syntax in *.changes and *.dsc
                        value = re.sub(", +", " ", value)
                        if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
                            srcpkg.s['Queue'] = 'ignore'
                            break
                        if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
                            print >>stderr, "Incompatible binaries between new.822(%s) and %s.html (%s)" % \
                                (srcpkg.s['Bin'], src_info_base, value)
                        srcpkg.s['Bin'] = value
                        print >>srco, "%s: %s" % (field, value)
                    else:
                        print >>stderr, "Binary should not mention Binary field in %s.html (%s)" % \
                            (src_info_base, value)
                elif field == 'Installed-Size':
                    if not in_source:
                        binpkg.b[field] = int(value)
                elif field == 'Homepage':
                    if not in_source:
                        binpkg.b[field] = value
                elif field == 'Section':
                    if not in_source:
                        if not binpkg:
                            print >>stderr, "This should not happen", srcpkg, field, value
                            # Same effect as exit(-1) without depending on the
                            # interactive helper installed by the site module.
                            raise SystemExit(-1)
                        else:
                            binpkg.b[field] = value
                            binpkg.b['Component'] = srcpkg.s['Component']
                elif field == 'Vcs-Browser':
                    srcpkg.s[field] = value
                elif binpkg is not None and field in dependencies_to_accept:
                    binpkg.b[field] = value
                    print >>srco, "%s: %s" % (field, value)
                elif field in fields_to_pass or field.startswith('Npp-'):
                    print >>srco, "%s: %s" % (field, value)
                else:
                    matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
                    if matchvcs:
                        srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
                        srcpkg.s['Vcs-Url'] = value
                        print >>srco, "%s: %s" % (field, value)
                    else:
                        print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
                        print >>srco, "*%s: %s" % (field, value)
                continue
            if in_description:
                # Inside a multi-line description: collect until the closing </pre>
                match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
                if match:
                    if match.groups()[0][0] != ' ':
                        description += ' '
                    description += de_html(match.groups()[0])
                    in_description = 0
                    if not in_source:  # binpkg and binpkg.b:
                        (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n", 1)
                        print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
                else:
                    if line[0] != ' ':
                        description += ' '
                    description += de_html(line)
            else:
                match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
                if match:
                    in_description = 1
                    # First line is the short description; the rest follows after "\n"
                    description = de_html(match.groups()[0]) + "\n"
        # Append last read binary package to list of binary packages
        # (there is none if parsing stopped before the first 'Package' field)
        if binpkg is not None:
            binpkgs.append(binpkg)
        return binpkgs, src_info_found

    def _record_itp_bug(self, cur, srcpkg, value):
        """Store the first ITP/RFP/ITA wnpp bug listed in the Closes value in srcpkg.s['Closes'].

        *value* is the blank separated list of bug numbers.  Problems (unknown
        bug, bug not filed against wnpp, more than one ITP) are reported on
        stderr only.  A non-numeric token raises ValueError.
        """
        found_itp = 0
        for val in value.split():
            ival = int(val)
            query = "SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival)
            cur.execute(query)
            row = cur.fetchone()
            if row is None:
                # Not a wnpp bug: find out what it is for the report and go on
                # with the next bug, there is no title to inspect.
                query = "SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival)
                cur.execute(query)
                bug_info = cur.fetchone()
                if not bug_info:
                    print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
                else:
                    print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
                continue
            wnpp_title = row[0]
            if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
                print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
            elif found_itp:
                print >>stderr, "Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD" % \
                    (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes'])
                query = "SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival)
                cur.execute(query)
                is_merged = cur.fetchone()[0]
                if is_merged != 2:
                    print >>stderr, " --> Bugs should be merged in BTS!"
            else:  # stay with the ITP found first
                srcpkg.s['Closes'] = int(ival)
                found_itp = 1
        if not found_itp:
            print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])

    def _insert_packages(self, cur, srcpkg, binpkgs, src_info_html):
        """Insert *srcpkg* and all its *binpkgs* using the prepared insert statements.

        A binary package that fails with IntegrityError or lacks a needed
        field is reported on stderr and skipped; *src_info_html* is only used
        in that report.
        """
        srcpkg.check_dict()
        query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
                   %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
                   %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
                   %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
                   %(Section)s, %(Distribution)s, %(Component)s, %(Closes)s, %(License)s,
                   %(Last_modified)s, %(Queue)s)"""
        cur.execute(query, srcpkg.s)
        for binpkg in binpkgs:
            binpkg.check_dict()
            query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
                       %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
                       %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
                       %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
                       %(Installed-Size)s, %(Homepage)s, %(Section)s,
                       %(Long_Description)s, %(Distribution)s, %(Component)s, %(License)s)"""
            try:
                cur.execute(query, binpkg.b)
            except IntegrityError as err:
                print >>stderr, err, src_info_html
                print >>stderr, binpkg
                print >>stderr, binpkg.b
                continue
            except KeyError as err:
                print >>stderr, "Missing information field for binary package %s: %s" % (binpkg.b['Package'], err)
                continue
451 tille 1406
# NOTE(review): no `main` is defined or imported in the part of this file that
# is visible here, so running the module as a script would presumably raise
# NameError - confirm, and either provide main() or drop this guard.
if __name__ == '__main__':
    main()
454    
455     # vim:set et tabstop=2:

  ViewVC Help
Powered by ViewVC 1.1.5