/[collab-qa]/udd/udd/ftpnew_gatherer.py
ViewVC logotype

Contents of /udd/udd/ftpnew_gatherer.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1531 - (show annotations) (download) (as text)
Thu Jul 23 14:05:39 2009 UTC (3 years, 10 months ago) by lucas
File MIME type: text/x-python
File size: 21556 byte(s)
add ANALYZE at the end of all importers to teach pgsql some stats about the data we just imported
1 #!/usr/bin/env python
2
3 """
4 This script imports information from ftp new queue into the database
5 See http://ftp-master.debian.org/new.822 and
6 http://ftp-master.debian.org/new.html
7 """
8
9 from debian_bundle import deb822
10 from os import access, mkdir, unlink, W_OK
11 from sys import stderr
12 import aux
13 from aux import quote
14 from gatherer import gatherer
15 import email.Utils
16 import re
17 from time import ctime
18 from psycopg2 import IntegrityError
19
def get_gatherer(connection, config, source):
  "Return an ftpnew_gatherer built from the given connection, config and source"
  return ftpnew_gatherer(connection, config, source)
22
# A non-zero value makes the gatherer report on stderr every package it skips
# because its source or one of its binaries is already known in UDD.
DEBUG = 0
24
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
  """Strip <span>/<pre> markup from string and decode the basic HTML entities.

  Tags are removed before the entities are decoded so that an escaped literal
  such as '&lt;pre&gt;' survives as the text '<pre>'.  '&amp;' is decoded last,
  otherwise '&amp;lt;' would be unescaped twice and end up as '<'.
  """
  string = re.sub("</?span[^>]*>", '', string)
  string = re.sub("</?pre>", '', string)
  string = string.replace("&quot;", '"')
  string = string.replace("&lt;", '<')
  string = string.replace("&gt;", '>')
  # Must stay the last replacement, see the docstring.
  string = string.replace("&amp;", '&')
  return string
34
# These fields are not forwarded to UDD tables for the moment; they are only
# copied into the generated *.822 file.  Fields whose name starts with 'Npp-'
# are treated the same way by the parser.
fields_to_pass = (
  'Format',
  'Date',
  'Changed-By',
  'Files',
  'Uploaders',
  'Standards-Version',
  'Priority',
  'Urgency',
  'Dm-Upload-Allowed',
  'Autobuild',
  'Build-Depends',
  'Build-Depends-Indep',
  'Build-Conflicts',
  'Python-Version',
)
51
# Relationship fields that are stored for a binary package when they show up
# in its section of the html page.
dependencies_to_accept = (
  'Depends',
  'Recommends',
  'Suggests',
  'Enhances',
  'Pre-Depends',
  'Breaks',
  'Replaces',
  'Provides',
  'Conflicts',
)
54
class src_pkg():
  """Collects the fields of one source package found in the new queue.

  All fields live in the dictionary self.s, keyed by field name.
  """

  def __init__(self, source):
    "Create the record for source package `source` with preset defaults"
    self.s = {}
    self.s['Source'] = source
    # Set by the caller to (number of versions listed in the queue) - 1
    self.has_several_versions = 0
    self.s['Bin'] = ()           # binaries created from the source
    self.s['Architecture'] = ()  # architecture(s separated by blanks)
    # Just define Vcs fields in case it is not provided in the control
    self.s['Vcs-Type'] = None
    self.s['Vcs-Url'] = None
    # preset WNPP bug
    self.s['Closes'] = 0

  def check_dict(self):
    "Make sure that non-mandatory fields at least get a '' value"
    for field in ftpnew_gatherer.s_non_mandatory:
      self.s.setdefault(field, '')

  def __str__(self):
    """Return a one-line summary of the source package.

    Raises KeyError unless Version, Last_modified, Queue, Distribution,
    maintainer_name and maintainer_email have been filled in.
    """
    text = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % \
      (self.s)
    text += " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % (self.s)
    return text
80
class bin_pkg():
  """Collects the fields of one binary package built from a source in the new queue.

  All fields live in the dictionary self.b, keyed by field name.
  """

  def __init__(self, package, source):
    "Create the record for binary `package` belonging to source package `source`"
    self.b = {}
    self.b['Package'] = package
    self.b['Source'] = source
    self.b['Installed-Size'] = 0
    self.b['License'] = ''

  def check_dict(self):
    "Make sure that non-mandatory fields at least get a '' value"
    for field in ftpnew_gatherer.b_non_mandatory:
      self.b.setdefault(field, '')

  def __str__(self):
    """Return a one-line summary of the binary package.

    Raises KeyError unless Version, Architecture, Maintainer, Description and
    Long_Description have been filled in.
    """
    return "Package %s: %s, %s, %s, %s, %s" % \
      (self.b['Package'], self.b['Version'], self.b['Architecture'], self.b['Maintainer'],
       self.b['Description'], self.b['Long_Description'])
99
class ftpnew_gatherer(gatherer):
  "This class imports the data from New queue into the database"

  # Field-name tables.  src_pkg.check_dict() and bin_pkg.check_dict() iterate
  # over the keys of s_non_mandatory and b_non_mandatory; the 0 values are
  # never read in this module.
  s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
                 'Queue': 0, 'Last_modified': 0}
  s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
                     'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
                     'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
                     'Vcs-Mtn':0, 'Vcs-Browser': 0, 'License': 0, 'Section': 0
                    }
  # NOTE(review): the key 'Vcs-Cvs:' carries a trailing colon - presumably a
  # malformed field name seen in real data; confirm before "fixing" it.
  s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg':0, 'X-Vcs-Git':0,
                 'Directory':0, 'Comment':0, 'Origin':0, 'Url':0, 'X-Collab-Maint':0, 'Autobuild':0, 'Vcs-Cvs:':0,
                 'Python-Standards-Version':0, 'url':0, 'originalmaintainer':0, 'Originalmaintainer':0,
                 'Build-Recommends':0,
                 'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
                 'Priority': 0, 'Python-Version': 0, 'Checksums-Sha1':0,
                 'Checksums-Sha256':0, 'Original-Maintainer':0, 'Dm-Upload-Allowed':0,
                 'Standards-Version': 0,
                }

  b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
                     'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks':0, 'Installed-Size': 0,
                     'Homepage': 0, 'Size': 0, 'Build-Essential':0, 'Origin':0,
                     'SHA1':0, 'Replaces':0, 'Section':0, 'MD5sum':0, 'Bugs':0, 'Priority':0,
                     'Tag':0, 'Task':0, 'Python-Version':0, 'Provides':0, 'Conflicts':0,
                     'SHA256':0, 'Original-Maintainer':0}

  # The misspelled alternatives ('Origianl-', 'Orginal-') are part of the pattern.
  s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
  s_vcs = { 'Arch':0, 'Bzr':0, 'Cvs':0, 'Darcs':0, 'Git':0, 'Hg':0, 'Svn':0, 'Mtn':0}

  # Patterns used to parse the per-package html page line by line
  src_html_failed_re = re.compile(r"^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
  src_html_has_tag_re = re.compile(r'^\s*<tr><td class="key">([-\w]+):</td><td class="val">(.+)</td></tr>$')
  src_html_has_description_start_re = re.compile(r'^\s*<tr><td class="key">Description:</td><td class="val"><pre>(.+)')
  src_html_has_description_end_re = re.compile('(.+)</pre></td></tr>')
  closes_is_itp_re = re.compile(r'^\s*(ITP|RFP|ITA)')
  vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')

  def __init__(self, connection, config, source):
    "Initialise the base gatherer and verify that all needed config keys are present"
    gatherer.__init__(self, connection, config, source)
    self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')


  def check_existing_binaries(self, values, queue):
    """Return 1 if any binary package name in `values` is already in UDD, else 0.

    Sometimes the source package name has changed, but the binary package name
    is known in UDD; we are not interested in these packages.  The prepared
    statement ftpnew_check_existing_package has to exist (run() creates it).
    `queue` is only used for the debug message.
    """
    cur = self.cursor()
    for value in values:
      # Let the driver quote the package name instead of pasting it into the SQL
      cur.execute("EXECUTE ftpnew_check_existing_package (%s)", (value,))
      in_udd = cur.fetchone()[0]
      if in_udd:
        if DEBUG != 0:
          print >>stderr, ("Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)"
                           % (value, int(in_udd), queue))
        return 1
    return 0

  def _record_wnpp_bug(self, cur, srcpkg, value):
    """Look up the bugs of a Closes field and store the first ITP/RFP/ITA bug.

    `value` is the blank separated list of bug numbers.  The first bug filed
    against 'wnpp' whose title matches closes_is_itp_re is stored in
    srcpkg.s['Closes']; everything suspicious is reported on stderr.
    """
    found_itp = 0
    for val in value.split(' '):
      ival = int(val)
      cur.execute("SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival))
      row = cur.fetchone()
      if row is None:
        # Not a wnpp bug: report what it is and go on with the next bug.  There
        # is no title to inspect, so the ITP check below must not be reached.
        cur.execute("SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival))
        bug_info = cur.fetchone()
        if not bug_info:
          print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
        else:
          print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
        continue
      wnpp_title = row[0]
      if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
        print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
      elif found_itp:
        print >>stderr, ("Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD"
                         % (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes']))
        cur.execute("SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival))
        is_merged = cur.fetchone()[0]
        if is_merged != 2:
          print >>stderr, " --> Bugs should be merged in BTS!"
      else: # stay with the ITP found first
        srcpkg.s['Closes'] = int(ival)
        found_itp = 1
    if not found_itp:
      print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])

  def _store_packages(self, cur, srcpkg, binpkgs, src_info_html):
    """Insert the source package and its binary packages via the prepared statements.

    A binary package that violates a constraint or lacks a field is reported
    on stderr and skipped; `src_info_html` is only used in that report.
    """
    source_query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
          %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
          %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
          %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
          %(Section)s, %(Distribution)s, %(Component)s, %(Closes)s, %(License)s,
          %(Last_modified)s, %(Queue)s)"""
    package_query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
          %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
          %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
          %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
          %(Installed-Size)s, %(Homepage)s, %(Section)s,
          %(Long_Description)s, %(Distribution)s, %(Component)s, %(License)s)"""
    srcpkg.check_dict()
    cur.execute(source_query, srcpkg.s)
    for binpkg in binpkgs:
      binpkg.check_dict()
      try:
        cur.execute(package_query, binpkg.b)
      except IntegrityError as err:
        print >>stderr, err, src_info_html
        print >>stderr, binpkg
        print >>stderr, binpkg.b
      except KeyError as err:
        print >>stderr, "Missing information field for binary package %s: %s" % (binpkg.b['Package'], err)

  def run(self):
    """Import the new queue into the configured source and package tables.

    Both tables are emptied first.  Every stanza of <path>/new.822 that is not
    in queue 'accepted' or 'proposedupdates' and whose source is not yet in
    UDD is then completed from <path>/<source>_<version>.html.  While that page
    is parsed a <source>_<version>.822 file is written next to it.  Packages
    with a binary that is already known in UDD are not inserted.
    """
    my_config = self.my_config

    #start harassing the DB, preparing the final inserts and making place
    #for the new data:
    cur = self.cursor()

    # if we check whether a package just exists in UDD we ignore oldstable which is currently etch but other
    # dists might have to be ignored as well
    cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)"
                % my_config["releases_ignore"])
    # For some reason the code tries to add binary packages twice - just verify whether the package is
    # just included to make sure we do not trigger conflicting primary keys
    # NOTE(review): this statement is prepared but never executed in this module.
    cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")

    cur.execute("DELETE FROM %s" % my_config["table_sources"])
    cur.execute("DELETE FROM %s" % my_config["table_packages"])

    query = """PREPARE ftpnew_insert_source
      AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
                         changed_by, architecture, homepage,
                         vcs_type, vcs_url, vcs_browser,
                         section, distribution, component, closes, license, last_modified, queue)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)""" % (my_config['table_sources'])
    cur.execute(query)
    query = """PREPARE ftpnew_insert_package
      AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
                         depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
                         installed_size, homepage, section, long_description, distribution, component, license)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)""" \
      % (my_config['table_packages'])
    cur.execute(query)

    ftpnew_data = open(my_config['path']+'/new.822')
    try:
      has_warned_about_missing_section_key = 0
      for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
        if stanza['queue'] in ('accepted', 'proposedupdates'):
          continue
        srcpkg = src_pkg(stanza['source'])
        versions = stanza['version'].split(' ')       # the page lists more than one version
        srcpkg.has_several_versions = len(versions)-1 # some tests below fail if more than one version in in queue
        srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
        srcpkg.s['Architecture'] = stanza['architectures']
        srcpkg.s['Queue'] = stanza['queue']
        srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified'])) # store a readable time string instead of an epoch
        srcpkg.s['Distribution'] = stanza['distribution']
        srcpkg.s['Changed-By'] = stanza['changed-by']
        try:
          section = stanza['section']
        except KeyError:
          srcpkg.s['Section'] = ''
          srcpkg.s['Component'] = ''
          if has_warned_about_missing_section_key == 0:
            has_warned_about_missing_section_key = 1
            print >>stderr, "Warning: Because of a bug in DAK code the Section field is currently missing."
        else:
          srcpkg.s['Section'] = section
          if section.startswith('non-free'):
            srcpkg.s['Component'] = 'non-free'
          elif section.startswith('contrib'):
            srcpkg.s['Component'] = 'contrib'
          else:
            srcpkg.s['Component'] = 'main'

        # Check UDD for existing source packages of this name
        cur.execute("SELECT count(*) FROM sources WHERE source = %s", (srcpkg.s['Source'],))
        in_udd = cur.fetchone()[0]
        if in_udd:
          if DEBUG != 0:
            print >>stderr, ("%s is %i times in UDD - no interest in just known sources (queue = %s)"
                             % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Queue']))
          continue

        src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
        src_info_html = my_config['path'] + '/' + src_info_base + '.html'
        src_info_822 = my_config['path'] + '/' + src_info_base + '.822'

        try:
          srci = open(src_info_html, 'r')
        except IOError as err:
          print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
          continue
        srco = open(src_info_822, 'w')
        src_info_missing = False
        in_description = False
        in_source = True   # True until the first Package field has been seen
        binpkgs = []
        binpkg = None
        try:
          for line in srci:
            if ftpnew_gatherer.src_html_failed_re.match(line):
              print >>stderr, "File %s not found." % (src_info_html)
              src_info_missing = True
              break
            match = ftpnew_gatherer.src_html_has_tag_re.match(line)
            if match:
              field = match.groups()[0]
              value = de_html(match.groups()[1])
              if field == 'Package':
                # Here begins a new binary package
                if self.check_existing_binaries((value,), srcpkg.s['Queue']):
                  srcpkg.s['Queue'] = 'ignore'
                  break
                in_source = False
                if binpkg:
                  binpkgs.append(binpkg)
                binpkg = bin_pkg(value, srcpkg.s['Source'])
                print >>srco, "\nPackage: %s" % (value)
                binpkg.b['Distribution'] = srcpkg.s['Distribution']
              elif field == 'Maintainer':
                if in_source:
                  srcpkg.s[field] = value
                  srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
                else:
                  binpkg.b[field] = value
                print >>srco, "%s: %s" % (field, value)
              elif field == 'Description':
                # value went through de_html() above; decoding it a second
                # time would unescape entities twice.
                if in_source:
                  srcpkg.s[field] = value
                else:
                  binpkg.b[field] = value
                print >>srco, "%s: %s" % (field, value)
              elif field == 'Architecture':
                if in_source:
                  srcpkg.s[field] = value
                else:
                  binpkg.b[field] = value
                print >>srco, "%s: %s" % (field, value)
              elif field == 'Source':
                if in_source:
                  if value != srcpkg.s['Source']:
                    print >>stderr, ("Incompatible source names between new.822(%s) and %s.html (%s)"
                                     % (srcpkg.s['Source'], src_info_base, value))
                  srcpkg.s['Source'] = value
                print >>srco, "%s: %s" % (field, value)
              elif field == 'Version':
                if in_source:
                  if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
                    print >>stderr, ("Incompatible version numbers between new.822(%s) and %s.html (%s)"
                                     % (srcpkg.s[field], src_info_base, value))
                  srcpkg.s[field] = value
                else:
                  binpkg.b[field] = value
                print >>srco, "%s: %s" % (field, value)
              elif field == 'Closes':
                self._record_wnpp_bug(cur, srcpkg, value)
                print >>srco, "%s: %s\n" % (field, value)
              elif field == 'Distribution':
                if in_source:
                  if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
                    print >>stderr, ("Incompatible distributions between new.822(%s) and %s.html (%s)"
                                     % (srcpkg.s['Distribution'], src_info_base, value))
                  srcpkg.s['Distribution'] = value
                  print >>srco, "%s: %s" % (field, value)
                else:
                  print >>stderr, ("Binary should not mention distribution field in %s.html (%s)"
                                   % (src_info_base, value))
              elif field == 'Binary':
                if in_source:
                  # Binaries are mentioned in different syntax in *.changes and *.dsc
                  value = re.sub(", +", " ", value)
                  if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
                    srcpkg.s['Queue'] = 'ignore'
                    break
                  if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
                    print >>stderr, ("Incompatible binaries between new.822(%s) and %s.html (%s)"
                                     % (srcpkg.s['Bin'], src_info_base, value))
                  srcpkg.s['Bin'] = value
                  print >>srco, "%s: %s" % (field, value)
                else:
                  print >>stderr, ("Binary should not mention Binary field in %s.html (%s)"
                                   % (src_info_base, value))
              elif field == 'Installed-Size':
                if not in_source:
                  binpkg.b[field] = int(value)
              elif field == 'Homepage':
                if not in_source:
                  binpkg.b[field] = value
              elif field == 'Section':
                if not in_source:
                  if not binpkg:
                    print >>stderr, "This should not happen", srcpkg, field, value
                    # exit() is only available when the site module is loaded
                    raise SystemExit(-1)
                  else:
                    binpkg.b[field] = value
                    binpkg.b['Component'] = srcpkg.s['Component']
              elif field == 'Vcs-Browser':
                srcpkg.s[field] = value
              elif binpkg is not None and field in dependencies_to_accept:
                binpkg.b[field] = value
                print >>srco, "%s: %s" % (field, value)
              elif field in fields_to_pass or field.startswith('Npp-'):
                print >>srco, "%s: %s" % (field, value)
              else:
                matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
                if matchvcs:
                  srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
                  srcpkg.s['Vcs-Url'] = value
                  print >>srco, "%s: %s" % (field, value)
                else:
                  print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
                  print >>srco, "*%s: %s" % (field, value)
              continue
            # Not a complete "key: value" row - may belong to a multi line description
            if in_description:
              match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
              if match:
                if match.groups()[0][0] != ' ':
                  description += ' '
                description += de_html(match.groups()[0])
                in_description = False
                if not in_source:
                  (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
                  print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
              else:
                if line[0] != ' ':
                  description += ' '
                description += de_html(line)
            else:
              match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
              if match:
                in_description = True
                description = de_html(match.groups()[0]) + "\n"
        finally:
          srci.close()
          srco.close()
        if src_info_missing:
          # The html page is only an error page, so there is nothing to insert
          continue
        # Append last read binary package to list of binary packages
        if binpkg is not None:
          binpkgs.append(binpkg)
        if srcpkg.s['Queue'] != 'ignore':
          self._store_packages(cur, srcpkg, binpkgs, src_info_html)
    finally:
      ftpnew_data.close()

    cur.execute("DEALLOCATE ftpnew_insert_source")
    cur.execute("DEALLOCATE ftpnew_insert_package")
    cur.execute("DEALLOCATE ftpnew_check_existing_package")
    cur.execute("DEALLOCATE ftpnew_check_just_added_package")
    cur.execute("ANALYZE %s" % my_config["table_sources"])
    cur.execute("ANALYZE %s" % my_config["table_packages"])
451
if __name__ == '__main__':
  # NOTE(review): no main() is defined or imported in this module as shown, so
  # running the file as a script ends in a NameError.  Presumably the gatherer
  # is only meant to be loaded through get_gatherer() - confirm, then either
  # provide main() or drop this hook.
  main()
454
455 # vim:set et tabstop=2:

  ViewVC Help
Powered by ViewVC 1.1.5