14
14
from github import Repository
15
15
from github .GithubException import GithubException
16
16
from github .Issue import Issue
17
+ from github .NamedUser import NamedUser
17
18
from github .PaginatedList import PaginatedList
18
19
from github .PullRequest import PullRequest
19
20
from github .Requester import Requester
@@ -219,14 +220,26 @@ def _get_batch_rate_limited(
219
220
)
220
221
221
222
223
def _get_userinfo(user: NamedUser) -> dict[str, str]:
    """Return the identifying fields of a GitHub user that are actually set.

    The result contains, in this order, whichever of ``login``, ``name`` and
    ``email`` are not ``None`` on ``user``. Fields that are ``None`` are
    omitted rather than emitted as empty values, so an account with no
    public name or email yields just ``{"login": ...}``.

    Reading a field is best-effort: if the attribute access raises
    ``GithubException`` the field is skipped instead of propagating the
    error, because this data is only used as optional metadata and should
    not abort conversion of the surrounding pull request or issue.

    Args:
        user: The PyGithub user object to describe.

    Returns:
        A mapping from field name to its value for every field that could
        be read and is not ``None``.
    """
    import logging

    info: dict[str, str] = {}
    for field in ("login", "name", "email"):
        try:
            # NOTE(review): PyGithub may complete a partially loaded object
            # with an extra API request on attribute access, which is
            # presumably where a GithubException would originate - confirm.
            value = getattr(user, field)
        except GithubException:
            logging.getLogger(__name__).debug(
                "Could not read %r for GitHub user; omitting it",
                field,
                exc_info=True,
            )
            continue
        if value is not None:
            info[field] = value
    return info
233
+
234
+
222
235
def _convert_pr_to_document (pull_request : PullRequest ) -> Document :
223
236
return Document (
224
237
id = pull_request .html_url ,
225
238
sections = [
226
239
TextSection (link = pull_request .html_url , text = pull_request .body or "" )
227
240
],
228
241
source = DocumentSource .GITHUB ,
229
- semantic_identifier = pull_request .title ,
242
+ semantic_identifier = f" { pull_request .number } : { pull_request . title } " ,
230
243
# updated_at is UTC time but is timezone unaware, explicitly add UTC
231
244
# as there is logic in indexing to prevent wrong timestamped docs
232
245
# due to local time discrepancies with UTC
@@ -236,8 +249,49 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
236
249
else None
237
250
),
238
251
metadata = {
239
- "merged" : str (pull_request .merged ),
240
- "state" : pull_request .state ,
252
+ k : [str (vi ) for vi in v ] if isinstance (v , list ) else str (v )
253
+ for k , v in {
254
+ "object_type" : "PullRequest" ,
255
+ "id" : pull_request .number ,
256
+ "merged" : pull_request .merged ,
257
+ "state" : pull_request .state ,
258
+ "user" : _get_userinfo (pull_request .user ) if pull_request .user else None ,
259
+ "assignees" : [
260
+ _get_userinfo (assignee ) for assignee in pull_request .assignees
261
+ ],
262
+ "repo" : (
263
+ pull_request .base .repo .full_name if pull_request .base else None
264
+ ),
265
+ "num_commits" : str (pull_request .commits ),
266
+ "num_files_changed" : str (pull_request .changed_files ),
267
+ "labels" : [label .name for label in pull_request .labels ],
268
+ "created_at" : (
269
+ pull_request .created_at .replace (tzinfo = timezone .utc )
270
+ if pull_request .created_at
271
+ else None
272
+ ),
273
+ "updated_at" : (
274
+ pull_request .updated_at .replace (tzinfo = timezone .utc )
275
+ if pull_request .updated_at
276
+ else None
277
+ ),
278
+ "closed_at" : (
279
+ pull_request .closed_at .replace (tzinfo = timezone .utc )
280
+ if pull_request .closed_at
281
+ else None
282
+ ),
283
+ "merged_at" : (
284
+ pull_request .merged_at .replace (tzinfo = timezone .utc )
285
+ if pull_request .merged_at
286
+ else None
287
+ ),
288
+ "merged_by" : (
289
+ _get_userinfo (pull_request .merged_by )
290
+ if pull_request .merged_by
291
+ else None
292
+ ),
293
+ }.items ()
294
+ if v is not None
241
295
},
242
296
)
243
297
@@ -252,11 +306,39 @@ def _convert_issue_to_document(issue: Issue) -> Document:
252
306
id = issue .html_url ,
253
307
sections = [TextSection (link = issue .html_url , text = issue .body or "" )],
254
308
source = DocumentSource .GITHUB ,
255
- semantic_identifier = issue .title ,
309
+ semantic_identifier = f" { issue .number } : { issue . title } " ,
256
310
# updated_at is UTC time but is timezone unaware
257
311
doc_updated_at = issue .updated_at .replace (tzinfo = timezone .utc ),
258
312
metadata = {
259
- "state" : issue .state ,
313
+ k : [str (vi ) for vi in v ] if isinstance (v , list ) else str (v )
314
+ for k , v in {
315
+ "object_type" : "Issue" ,
316
+ "id" : issue .number ,
317
+ "state" : issue .state ,
318
+ "user" : _get_userinfo (issue .user ) if issue .user else None ,
319
+ "assignees" : [_get_userinfo (assignee ) for assignee in issue .assignees ],
320
+ "repo" : issue .repository .full_name if issue .repository else None ,
321
+ "labels" : [label .name for label in issue .labels ],
322
+ "created_at" : (
323
+ issue .created_at .replace (tzinfo = timezone .utc )
324
+ if issue .created_at
325
+ else None
326
+ ),
327
+ "updated_at" : (
328
+ issue .updated_at .replace (tzinfo = timezone .utc )
329
+ if issue .updated_at
330
+ else None
331
+ ),
332
+ "closed_at" : (
333
+ issue .closed_at .replace (tzinfo = timezone .utc )
334
+ if issue .closed_at
335
+ else None
336
+ ),
337
+ "closed_by" : (
338
+ _get_userinfo (issue .closed_by ) if issue .closed_by else None
339
+ ),
340
+ }.items ()
341
+ if v is not None
260
342
},
261
343
)
262
344
0 commit comments