/[collab-qa]/udd/udd/ubuntu_bugs_gatherer.py
ViewVC logotype

Contents of /udd/udd/ubuntu_bugs_gatherer.py

Parent Directory | Revision Log


Revision 1531 - (show annotations) (download) (as text)
Thu Jul 23 14:05:39 2009 UTC (3 years, 10 months ago) by lucas
File MIME type: text/x-python
File size: 9405 byte(s)
add ANALYZE at the end of all importers to teach pgsql some stats about the data we just imported
1 #!/usr/bin/env python
2
3 """
4 This script imports the Ubuntu bugs from Launchpad
5 """
6
7 import sys
8 from aux import quote
9 from os import _exit
10 from gatherer import gatherer
11 import re
12 import urllib
13 from Queue import Queue, Empty
14 from threading import Thread, currentThread
15 import time
16 import httplib
17 import email
18
def get_gatherer(connection, config, source):
    """Build the gatherer defined in this module.

    The three arguments are handed unchanged to the ubuntu_bugs_gatherer
    constructor, and the new instance is returned.
    """
    instance = ubuntu_bugs_gatherer(connection, config, source)
    return instance
21
class ubuntu_bugs_gatherer(gatherer):
    """Gatherer that imports the Ubuntu bugs from Launchpad.

    run() downloads the list of bug numbers, starts a few threads that
    download the text export of every bug, and imports each export into the
    ubuntu_bugs* tables from the calling thread.
    """

    # When True, progress information is written to stdout.
    debug = False

    # Splits "something (detail)" into its two parts; see splitpar().
    parre = re.compile(r'^\s*(.*) \(([^(]*)\)$')
    # A chunk starting with this header ends the list of tasks in dbimport().
    contenttype = re.compile(r'^Content-Type: ')

    # Tables rewritten by run(), in the order the statements are issued
    # (ubuntu_bugs last).  Presumably the other tables reference
    # ubuntu_bugs -- verify against the schema before reordering.
    _TABLES = ('ubuntu_bugs_subscribers', 'ubuntu_bugs_duplicates',
               'ubuntu_bugs_tags', 'ubuntu_bugs_tasks', 'ubuntu_bugs')

    # Bugs that are never fetched.  310331: "this bug is buggy: two
    # identical tasks".
    _SKIPPED_BUGS = frozenset([310331])

    # Date format found in the text export, and the format sent to the
    # database (only the zone suffix differs).
    _DATE_IN = "%a, %d %b %Y %H:%M:%S -0000"
    _DATE_OUT = "%a, %d %b %Y %H:%M:%S +0000"

    # Headers we know about; anything else is reported on stdout so that
    # new fields do not go unnoticed.  'attachments' is known but ignored.
    _BUG_HEADERS = frozenset([
        'bug', 'title', 'reporter', 'attachments', 'subscribers', 'tags',
        'duplicate-of', 'duplicates', 'date-reported', 'date-updated',
        'security'])
    _TASK_HEADERS = frozenset([
        'task', 'reporter', 'assignee', 'status', 'date-created',
        'importance', 'component', 'milestone', 'date-assigned',
        'date-closed', 'date-incomplete', 'date-confirmed',
        'date-inprogress', 'date-fix-committed', 'date-fix-released',
        'watch', 'date-left-new', 'date-triaged', 'date-left-closed'])

    # Optional per-task dates, in the column order of the insert statement
    # in _import_task().
    _TASK_DATE_HEADERS = (
        'date-assigned', 'date-closed', 'date-incomplete', 'date-confirmed',
        'date-inprogress', 'date-fix-committed', 'date-fix-released',
        'date-left-new', 'date-triaged', 'date-left-closed')

    def __init__(self, connection, config, source):
        """Pass all arguments through to the gatherer base class."""
        gatherer.__init__(self, connection, config, source)

    @staticmethod
    def _log(*parts):
        """Write the parts to stdout, separated by single spaces.

        This produces the same text as a Python 2 ``print a, b, c``
        statement, without depending on the print statement syntax.
        """
        sys.stdout.write(' '.join([str(p) for p in parts]) + '\n')

    def run(self):
        """Fetch all bugs and replace the content of the ubuntu_bugs* tables.

        The calling thread is the only database writer: it deletes the old
        rows, imports every bug export delivered by the fetcher threads and
        finally runs ANALYZE on the tables.
        """
        num_fetchers = 4
        poll_timeout = 5     # seconds to wait for the next fetched bug
        max_idle_polls = 12  # consecutive empty polls tolerated at the end

        httpq = Queue()
        dbq = Queue()
        for b in self.fetch_all_bugs():
            if b not in self._SKIPPED_BUGS:
                httpq.put(b)

        # start workers
        fetchers = []
        for i in range(num_fetchers):
            t = Thread(target=self.bugfetcher, name="Fetcher-" + str(i),
                       args=[httpq, dbq])
            t.setDaemon(True)
            t.start()
            fetchers.append(t)

        c = self.cursor()
        for table in self._TABLES:
            c.execute("delete from " + table)

        idle_polls = 0
        while True:
            if self.debug:
                self._log("HTTPQ: ", httpq.qsize(), " DBQ: ", dbq.qsize())
            try:
                data = dbq.get(True, poll_timeout)
            except Empty:
                if not any(t.is_alive() for t in fetchers):
                    # Nothing can be queued any more, so an empty dbq now
                    # really means that everything has been imported.
                    if dbq.empty():
                        break
                    continue
                # A fetcher is still running: an empty httpq alone does not
                # mean we are done, a request may still be in flight.
                idle_polls += 1
                if httpq.empty() and idle_polls >= max_idle_polls:
                    # Do not wait forever for a fetcher that keeps
                    # retrying a bug that never succeeds.
                    self._log("No bug received for",
                              idle_polls * poll_timeout,
                              "seconds, giving up on the remaining fetchers.")
                    break
                continue
            idle_polls = 0
            self.dbimport(c, data)

        for table in self._TABLES:
            c.execute("analyze " + table)

    def fetch_all_bugs(self):
        """Return the sorted list of distinct Ubuntu bug numbers (ints).

        The numbers are read from Launchpad's +bugs-text page, which is
        expected to contain one bug number per line.
        """
        fh = urllib.urlopen('https://launchpad.net/ubuntu/+bugs-text')
        try:
            text = fh.read()
        finally:
            # close the handle even when the download fails
            fh.close()
        return self._parse_bug_list(text)

    @staticmethod
    def _parse_bug_list(text):
        """Turn one-number-per-line text into a sorted list of unique ints.

        Blank lines are ignored wherever they occur; any other line that is
        not a number raises ValueError.
        """
        numbers = set(int(line) for line in text.splitlines() if line.strip())
        return sorted(numbers)

    # "worker". Fetch a specific bug as text from launchpad.
    def bugfetcher(self, hq, dq):
        """Thread body: download bugs taken from hq and put the text on dq.

        hq holds bug numbers, dq receives the raw text export of each bug.
        The method returns once hq is empty.  After a recoverable failure
        the bug is put back on hq and a new connection is opened.
        """
        while True:
            conn = httplib.HTTPSConnection('bugs.launchpad.net')
            try:
                if self._fetch_with(conn, hq, dq):
                    return
            finally:
                # do not leak the socket when we reconnect or stop
                conn.close()

    def _fetch_with(self, conn, hq, dq):
        """Fetch bugs over conn until hq is empty or a request fails.

        Returns True when hq is exhausted (the worker should stop) and
        False when the current bug was requeued and the connection should
        be replaced.  Unexpected errors, and a 302 answer, terminate the
        whole process with exit status 1.
        """
        who = currentThread().getName()
        while True:
            try:
                b = hq.get(False)
            except Empty:
                return True
            except Exception:
                self._log("Other exception raised in bugfetcher. exiting.")
                _exit(1)

            try:
                conn.request('GET',
                             'https://launchpad.net/bugs/' + str(b) + '/+text')
                r = conn.getresponse()
                if r.status == 200:
                    data = r.read()
                    if data != '':
                        dq.put(data)
                        continue
                    self._log("[", who, "] Bug ", b, ": Empty data.")
                else:
                    self._log("[", who, "] Bug ", b, ": Wrong status: ",
                              r.status, " ", r.reason)
                    if r.status == 302:
                        self._log("Exiting.")
                        _exit(1)
            except httplib.BadStatusLine:
                # There is no usable response object here: the exception is
                # raised while the status line is being read, so only the
                # exception itself can be reported.
                self._log("[", who, "] Bug ", b, ": BadStatusLine: ",
                          sys.exc_info()[1])
            except Exception:
                self._log("[", who, "] Bug ", b, ": error, exiting")
                self._log(sys.exc_info())
                _exit(1)

            # Recoverable failure: retry this bug later, on a new connection.
            hq.put(b)
            return False

    def splitpar(self, text):
        """Split "something (detail)" into the tuple (something, detail).

        Leading whitespace is dropped.  When text does not end with a
        parenthesised part, (text, '') is returned.
        """
        mo = self.parre.search(text)
        if mo is None:
            return (text, '')
        return mo.groups()

    def _convert_date(self, value):
        """Re-format a date of the text export for the database."""
        return time.strftime(self._DATE_OUT,
                             time.strptime(value, self._DATE_IN))

    def _optional_date(self, msg, key):
        """Return the converted date header key of msg, or '' if not set."""
        value = msg[key]
        if value:
            return self._convert_date(value)
        return ''

    def dbimport(self, c, data):
        """Import the text export of one bug using cursor c.

        data is split on blank lines: the first chunk holds the bug headers
        and the following chunks are tasks, up to the first chunk that
        starts with "Content-Type: ".  The last chunk is never treated as a
        task.  One row goes to ubuntu_bugs, plus one row per subscriber,
        duplicate, tag and task to the corresponding tables.
        """
        chunks = data.split('\n\n')
        tasks = []
        for chunk in chunks[1:-1]:
            if self.contenttype.match(chunk):
                break
            tasks.append(chunk)

        # OK, we have bugs and tasks.
        bm = email.message_from_string(chunks[0] + '\n')
        bugno = int(bm['bug'])

        # Check that we are not missing some fields
        unknown = set(bm.keys()) - self._BUG_HEADERS
        if unknown:
            self._log(unknown)

        name, login = self.splitpar(bm['reporter'])
        # A missing header is returned as None, an empty one as ''.
        dup = None
        if bm['duplicate-of']:
            dup = int(bm['duplicate-of'])
        # the mere presence of the header marks a security bug
        security = 'f'
        if bm['security'] is not None:
            security = 't'
        c.execute('insert into ubuntu_bugs values (%s, %s, %s, %s, %s, %s, %s, %s)',
                  (bugno, bm['title'], login, name, dup,
                   self._convert_date(bm['date-reported']),
                   self._convert_date(bm['date-updated']), security))

        # subscribers
        for sub in (bm['subscribers'] or '').split('\n'):
            if not sub.strip():
                # an empty header would otherwise give a row with an empty
                # login and name
                continue
            name, login = self.splitpar(sub)
            c.execute('insert into ubuntu_bugs_subscribers values (%s, %s, %s)',
                      (bugno, login, name))
        # duplicates
        for other in (bm['duplicates'] or '').split():
            c.execute('insert into ubuntu_bugs_duplicates values (%s, %s)',
                      (bugno, int(other)))
        # tags
        for tag in (bm['tags'] or '').split():
            c.execute('insert into ubuntu_bugs_tags values (%s, %s)',
                      (bugno, tag))

        ### Import tasks
        for t in tasks:
            self._import_task(c, bugno, t)

    def _import_task(self, c, bugno, t):
        """Insert the task described by the header text t for bug bugno."""
        tm = email.message_from_string(t)
        pkg, distro = self.splitpar(tm['task'])
        rep_name, rep_login = self.splitpar(tm['reporter'])
        ass_name = None
        ass_login = None
        if tm['assignee']:
            ass_name, ass_login = self.splitpar(tm['assignee'])
        created = self._convert_date(tm['date-created'])
        # dates that are not set are stored as ''
        dates = [self._optional_date(tm, key)
                 for key in self._TASK_DATE_HEADERS]

        # check for missing headers
        unknown = set(tm.keys()) - self._TASK_HEADERS
        if unknown:
            self._log(unknown)
            self._log(t)
        if self.debug:
            self._log(str(bugno) + "\n")

        row = [bugno, pkg, distro, tm['status'], tm['importance'],
               tm['component'], tm['milestone'], created]
        row.extend(dates)
        row.extend([tm['watch'], rep_login, rep_name, ass_login, ass_name])
        c.execute('insert into ubuntu_bugs_tasks values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                  tuple(row))
266

  ViewVC Help
Powered by ViewVC 1.1.5